From eb5cdd9008011417acd3d30aae3aecf65fd68878 Mon Sep 17 00:00:00 2001 From: bergvca Date: Sun, 11 Apr 2021 21:33:26 +0200 Subject: [PATCH 01/29] Added changelog with all changes since version 0.3.2 --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..d1cb63ff --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,31 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.4.0] - 2021-04-11 + +### Added + +* Added group representative functionality - by default the centroid is used. From [@ParticularMiner](https://github.com/ParticularMiner) +* Added string_grouper_utils package with additional group-representative functionality: + * new_group_rep_by_earliest_timestamp + * new_group_rep_by_completeness + * new_group_rep_by_highest_weight + + From [@ParticularMiner](https://github.com/ParticularMiner) +* Original indices are now added by default to output of `group_similar_strings`, `match_most_similar` and `match_strings`. + From [@ParticularMiner](https://github.com/ParticularMiner) +* `compute_pairwise_similarities` function From [@ParticularMiner](https://github.com/ParticularMiner) + +### Changed + +* Default group representative is now the centroid. Used to be the first string in the series belonging to a group. + From [@ParticularMiner](https://github.com/ParticularMiner) +* Output of `match_most_similar` and `match_strings` is now a `pandas.DataFrame` object instead of a `pandas.Series` +by default. From [@ParticularMiner](https://github.com/ParticularMiner) +* Fixed a bug which occurs when min_similarity=0. 
From [@ParticularMiner](https://github.com/ParticularMiner) \ No newline at end of file From 5cb0066367228bba8c1f308c726d93e0728214ed Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 12 Apr 2021 21:34:34 +0200 Subject: [PATCH 02/29] added StringGrouper attribute function _get_true_max_n_matches() and removed kwarg suppress_warning. An error is now raised when max_n_matches is too small. --- README.md | 3 +- string_grouper/string_grouper.py | 62 ++++++++++++++-------- string_grouper/test/test_string_grouper.py | 6 +-- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index d1579a48..7faa5239 100644 --- a/README.md +++ b/README.md @@ -142,8 +142,7 @@ All functions are built using a class **StringGrouper**. This class `number of cores on a machine - 1.` * **ignore_index**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) * **replace_na**: For function match_most_similar, determines whether `NaN` values in index-columns are replaced or not by index-labels from duplicates. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. - * **suppress_warning**: when min_similarity ≤ 0 and include_zeroes is `True`, determines whether or not to suppress the message warning that max_n_matches may be too small. 
Defaults to `False`. + * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md).) **Note:** If include_zeroes is `True` and the kwarg max_n_matches is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and string_grouper suggests an alternative value for max_n_matches. To allow string_grouper to automatically use the appropriate value for max_n_matches then do not set this kwarg at all. * **group_rep**: For function group_similar_strings, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. ## Examples diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 8485f5c2..21ead95d 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -8,7 +8,6 @@ from typing import Tuple, NamedTuple, List, Optional, Union from sparse_dot_topn import awesome_cossim_topn from functools import wraps -import warnings DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -21,9 +20,6 @@ # similar string index-columns with corresponding duplicates-index values DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity # matches appear in the output -DEFAULT_SUPPRESS_WARNING: bool = False # when the minimum cosine similarity <=0 and zero-similarity matches are - # requested, determines whether or not to suppress the message warning that - # max_n_matches may be too small GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest # similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as 
group-representative: @@ -153,8 +149,6 @@ class StringGrouperConfig(NamedTuple): :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False. :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to True. - :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to supress - the message warning that max_n_matches may be too small. Defaults to False. :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to False. :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'. @@ -163,13 +157,12 @@ class StringGrouperConfig(NamedTuple): ngram_size: int = DEFAULT_NGRAM_SIZE regex: str = DEFAULT_REGEX - max_n_matches: int = DEFAULT_MAX_N_MATCHES + max_n_matches: Optional[int] = None min_similarity: float = DEFAULT_MIN_SIMILARITY number_of_processes: int = DEFAULT_N_PROCESSES ignore_case: bool = DEFAULT_IGNORE_CASE ignore_index: bool = DEFAULT_DROP_INDEX include_zeroes: bool = DEFAULT_INCLUDE_ZEROES - suppress_warning: bool = DEFAULT_SUPPRESS_WARNING replace_na: bool = DEFAULT_REPLACE_NA group_rep: str = DEFAULT_GROUP_REP @@ -226,12 +219,17 @@ def __init__(self, master: pd.Series, self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) + self._max_n_matches = DEFAULT_MAX_N_MATCHES if self._config.max_n_matches is None \ + else self._config.max_n_matches self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) - # After the StringGrouper is build, _matches_list will contain 
the indices and similarities of two matches + # After the StringGrouper is built, _matches_list will contain the indices and similarities of two matches + # and _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._matches_list: pd.DataFrame = pd.DataFrame() + self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: """ @@ -271,8 +269,7 @@ def dot(self) -> pd.Series: @validate_is_fit def get_matches(self, ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool]=None, - suppress_warning: Optional[bool]=None) -> pd.DataFrame: + include_zeroes: Optional[bool]=None) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows @@ -281,8 +278,6 @@ def get_matches(self, self._config.ignore_index. :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. - :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress - the message warning that max_n_matches may be too small. Defaults to self._config.suppress_warning. 
""" def get_both_sides(master: pd.Series, duplicates: pd.Series, @@ -306,14 +301,13 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): if ignore_index is None: ignore_index = self._config.ignore_index if include_zeroes is None: include_zeroes = self._config.include_zeroes - if suppress_warning is None: suppress_warning = self._config.suppress_warning if self._config.min_similarity > 0 or not include_zeroes: matches_list = self._matches_list elif include_zeroes: # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic): # the fix includes zero-similarity matches that are missing by default # in _matches_list due to our use of sparse matrices - non_matches_list = self._get_non_matches_list(suppress_warning) + non_matches_list = self._get_non_matches_list() matches_list = self._matches_list if non_matches_list.empty else \ pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) @@ -437,6 +431,12 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix """Builds the cossine similarity matrix of two csr matrices""" tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() + + # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: + if self._config.min_similarity <= 0: + self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) + if self._config.max_n_matches is None: + self._max_n_matches = self._true_max_n_matches optional_kwargs = dict() if self._config.number_of_processes > 1: @@ -446,7 +446,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix } return awesome_cossim_topn(tf_idf_matrix_1, tf_idf_matrix_2, - self._config.max_n_matches, + self._max_n_matches, self._config.min_similarity, **optional_kwargs) @@ -462,23 +462,39 @@ def _symmetrize_matches_list(self): ).set_index(['master_side', 'dupe_side']) ).reset_index() - def 
_get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: + def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates) all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) missing_pairs = all_pairs.difference(matched_pairs) if missing_pairs.empty: return pd.DataFrame() - if (self._config.max_n_matches < d_sz) and not suppress_warning: - warnings.warn(f'WARNING: max_n_matches={self._config.max_n_matches} may be too small!\n' - f'\t\t Some zero-similarity matches returned may be false!\n' - f'\t\t To be absolutely certain all zero-similarity matches are true,\n' - f'\t\t try setting max_n_matches={d_sz} (the length of the Series parameter duplicates).\n' - f'\t\t To suppress this warning, set suppress_warning=True.') + if (self._max_n_matches < self._true_max_n_matches): + raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n' + f'\t\t max_n_matches={self._max_n_matches} is too small!\n' + f'\t\t Try setting max_n_matches={self._true_max_n_matches} (the \n' + f'\t\t true maximum number of matches over all strings in master)\n' + f'\t\t or greater or do not set this kwarg at all.') missing_pairs = missing_pairs.to_frame(index=False) missing_pairs['similarity'] = 0 return missing_pairs + @staticmethod + def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: + """Returns the true maximum number of matches over all strings in master""" + def get_n_matches(i: int) -> int: + a_cols = A.indices[A.indptr[i]:A.indptr[i+1]] + nz = np.full(N, 0, dtype=int) + for j in a_cols: + nz[B.indices[B.indptr[j]:B.indptr[j+1]]] = 1 + return np.sum(nz) + + A, B = AA.tocsr(), BB.tocsr() + M, _ = A.shape + _, N = B.shape + v = 
np.vectorize(get_n_matches) + return np.amax(v(range(M))) + @staticmethod def _get_matches_list(matches) -> pd.DataFrame: """Returns a list of all the indices of matches""" diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 723d3f22..cbc4bcae 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,7 +9,6 @@ match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities from unittest.mock import patch -import warnings class SimpleExample(object): @@ -93,7 +92,7 @@ def test_config_defaults(self): """Empty initialisation should set default values""" config = StringGrouperConfig() self.assertEqual(config.min_similarity, DEFAULT_MIN_SIMILARITY) - self.assertEqual(config.max_n_matches, DEFAULT_MAX_N_MATCHES) + self.assertEqual(config.max_n_matches, None) self.assertEqual(config.regex, DEFAULT_REGEX) self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE) self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES) @@ -253,7 +252,7 @@ def test_zero_min_similarity(self): simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.whatever_series_1 - matches = match_strings(s_master, s_dup, max_n_matches=len(s_master), min_similarity=0) + matches = match_strings(s_master, s_dup, min_similarity=0) pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) def test_zero_min_similarity_small_max_n_matches(self): @@ -262,7 +261,6 @@ def test_zero_min_similarity_small_max_n_matches(self): simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.two_strings - warnings.simplefilter('error', UserWarning) with self.assertRaises(Exception): _ = match_strings(s_master, s_dup, max_n_matches=1, min_similarity=0) From f9f1868c15588ae43a84e2d1bb7289fad24c4eb9 Mon Sep 17 00:00:00 2001 From: Particular Miner 
<78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 12:55:57 +0200 Subject: [PATCH 03/29] modified ing-bank's sparse_dot_topn to get n_max_matches true value and made significant performance enhancements: 1. _symmetrize_matrix instead of _symmetrize_matches_list (boost x5) 2. _get_matches_list (boost x33) --- sparse_dot_topn/.gitignore | 4 + sparse_dot_topn/__init__.py | 7 + sparse_dot_topn/awesome_cossim_minmax_topn.py | 92 +++++ sparse_dot_topn/awesome_cossim_topn.py | 89 +++++ sparse_dot_topn/sparse_dot_topn.pyx | 160 ++++++++ sparse_dot_topn/sparse_dot_topn_parallel.cpp | 366 ++++++++++++++++++ sparse_dot_topn/sparse_dot_topn_parallel.h | 55 +++ sparse_dot_topn/sparse_dot_topn_source.cpp | 243 ++++++++++++ sparse_dot_topn/sparse_dot_topn_source.h | 57 +++ sparse_dot_topn/sparse_dot_topn_threaded.pyx | 120 ++++++ string_grouper/string_grouper.py | 65 ++-- string_grouper/test/test_string_grouper.py | 11 +- 12 files changed, 1228 insertions(+), 41 deletions(-) create mode 100644 sparse_dot_topn/.gitignore create mode 100644 sparse_dot_topn/__init__.py create mode 100644 sparse_dot_topn/awesome_cossim_minmax_topn.py create mode 100644 sparse_dot_topn/awesome_cossim_topn.py create mode 100644 sparse_dot_topn/sparse_dot_topn.pyx create mode 100644 sparse_dot_topn/sparse_dot_topn_parallel.cpp create mode 100644 sparse_dot_topn/sparse_dot_topn_parallel.h create mode 100644 sparse_dot_topn/sparse_dot_topn_source.cpp create mode 100644 sparse_dot_topn/sparse_dot_topn_source.h create mode 100644 sparse_dot_topn/sparse_dot_topn_threaded.pyx diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore new file mode 100644 index 00000000..d40e00a1 --- /dev/null +++ b/sparse_dot_topn/.gitignore @@ -0,0 +1,4 @@ +/sparse_dot_topn.cp39-win_amd64.pyd +/sparse_dot_topn_threaded.cp39-win_amd64.pyd +/sparse_dot_topn.cpp +/sparse_dot_topn_threaded.cpp diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py new file mode 100644 index 
00000000..09d2bfa7 --- /dev/null +++ b/sparse_dot_topn/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa +import sys + +if sys.version_info[0] >= 3: + from sparse_dot_topn.awesome_cossim_minmax_topn import awesome_cossim_minmax_topn +else: + from awesome_cossim_minmax_topn import awesome_cossim_minmax_topn \ No newline at end of file diff --git a/sparse_dot_topn/awesome_cossim_minmax_topn.py b/sparse_dot_topn/awesome_cossim_minmax_topn.py new file mode 100644 index 00000000..92fdf87f --- /dev/null +++ b/sparse_dot_topn/awesome_cossim_minmax_topn.py @@ -0,0 +1,92 @@ +import sys +import numpy as np +from scipy.sparse import csr_matrix +from scipy.sparse import isspmatrix_csr + +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +# else: + # import sparse_dot_topn as ct + # import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_minmax_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. 
if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return 0, csr_matrix((data, indices, indptr), shape=(M, N)) + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_minmax_topn( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' + raise ValueError(err_str) + + ct_thread.sparse_dot_minmax_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, minmax_topn, n_jobs) + + return minmax_topn[0], csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py new file mode 100644 index 00000000..c4af03d4 --- /dev/null +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -0,0 +1,89 @@ +import sys +import numpy as np +from scipy.sparse import csr_matrix +from scipy.sparse import isspmatrix_csr + +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +else: + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_topn(A, B, ntop, lower_bound=0, 
use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return csr_matrix((data, indices, indptr), shape=(M, N)) + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + if not use_threads: + + ct.sparse_dot_topn( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, n_jobs) + + return csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx new file mode 100644 index 00000000..1da3181a --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -0,0 +1,160 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at# +# http://www.apache.org/licenses/LICENSE-2.0# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Zhe Sun, Ahmet Erdem +# April 20, 2017 + +# distutils: language = c++ + +import numpy as np +cimport numpy as np + +cdef extern from "sparse_dot_topn_source.h": + + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); + + cdef void sparse_dot_minmax_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int minmax_topn[]); + +cpdef sparse_dot_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function aguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return + +cpdef sparse_dot_minmax_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_minmax_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + It also returns minmax_ntop (the maximum number of columns set + for each row of A * B when ntop is infinite) + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + minmax_ntop: the maximum number of columns set for each row of + A * B when ntop is infinite + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function aguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + return \ No newline at end of file diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp new file mode 100644 index 00000000..dc123b80 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#include +#include +#include +#include + +#include "./sparse_dot_topn_source.h" +#include "./sparse_dot_topn_parallel.h" + +void inner_sparse_function(int start_row, int end_row, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], + double Bx_copy[], std::vector real_candidates[]) +{ + +std::vector next(n_col_inner,-1); +std::vector sums(n_col_inner, 0); + +std::vector temp_candidates; + +int iterations_count = 0; + +for(int i = start_row; i < end_row; i++){ + + iterations_count += 1; + + int head = -2; + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + + + 
temp_candidates.resize(len); + real_candidates[i] = temp_candidates; + + temp_candidates.clear(); + +} + +} + +void sparse_dot_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs) +{ + + Cp[0] = 0; + + int split_amount = n_row / n_jobs; + + std::vector> split_row_vector(n_jobs); + + std::vector> real_candidates(n_row); + + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + + std::vector thread_list(n_jobs); + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + int start_split = job_nr * split_amount; + int end_split = start_split + split_amount; + + if (job_nr == n_jobs -1) { + end_split = n_row; + } + + temp_vector[0] = start_split; + temp_vector[1] = end_split; + + split_row_vector[job_nr] = temp_vector; + + } + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + + thread_list[job_nr] = std::thread (inner_sparse_function, start_row, + end_row, n_col, ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + thread_list[job_nr].join(); + } + + int nnz = 0; + + for (int m = 0; m < n_row; m++) { + + std::vector cand = real_cand_pointer[m]; + + int can_len = (int)cand.size(); + + for(int can_nr=0; can_nr < can_len; can_nr++){ + Cj[nnz] = cand[can_nr].index; + Cx[nnz] = cand[can_nr].value; + nnz++; + } + + Cp[m+1] = nnz; + + } + +} + +void inner_sparse_minmax_function(int start_row, int end_row, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], + double Bx_copy[], std::vector real_candidates[], + int *minmax_ntop) +{ + +std::vector next(n_col_inner,-1); +std::vector 
sums(n_col_inner, 0); + +std::vector temp_candidates; + +int iterations_count = 0; + +for(int i = start_row; i < end_row; i++){ + + iterations_count += 1; + + int head = -2; + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + + + temp_candidates.resize(len); + real_candidates[i] = temp_candidates; + + temp_candidates.clear(); + +} + +} + +void sparse_dot_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_ntop, + int n_jobs) +{ + + Cp[0] = 0; + + int split_amount = n_row / n_jobs; + + 
std::vector> split_row_vector(n_jobs); + + std::vector> real_candidates(n_row); + + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + + std::vector split_minmax_ntop(n_jobs, 0); + + std::vector thread_list(n_jobs); + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + int start_split = job_nr * split_amount; + int end_split = start_split + split_amount; + + if (job_nr == n_jobs -1) { + end_split = n_row; + } + + temp_vector[0] = start_split; + temp_vector[1] = end_split; + + split_row_vector[job_nr] = temp_vector; + + } + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + + thread_list[job_nr] = std::thread (inner_sparse_minmax_function, start_row, + end_row, n_col, ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &split_minmax_ntop[job_nr]); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + thread_list[job_nr].join(); + } + + int nnz = 0; + + for (int m = 0; m < n_row; m++) { + + std::vector cand = real_cand_pointer[m]; + + int can_len = (int)cand.size(); + + for(int can_nr=0; can_nr < can_len; can_nr++){ + Cj[nnz] = cand[can_nr].index; + Cx[nnz] = cand[can_nr].value; + nnz++; + } + + Cp[m+1] = nnz; + + } + *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + +} diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h new file mode 100644 index 00000000..bd70b573 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#ifndef UTILS_CPPCLASS_H +#define UTILS_CPPCLASS_H + +extern void sparse_dot_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs); + +extern void sparse_dot_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* minmax_topn, + int n_jobs); + +#endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp new file mode 100644 index 00000000..e5cc3e12 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#include +#include +#include + +#include "./sparse_dot_topn_source.h" + +bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } + +/* + C++ implementation of sparse_dot_topn + + This function will return a matrxi C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + + N.B. A and B must be CSR format!!! 
+*/ +void sparse_dot_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } +} + +/* + C++ implementation of sparse_dot_minmax_topn + + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. 
+ It also returns minmax_ntop (the maximum number of columns set + for each row of A * B when ntop is infinite) + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + minmax_ntop: the maximum number of columns set for each row of + A * B when ntop is infinite + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_ntop) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + *minmax_ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } +} diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h new file mode 100644 index 00000000..d51de107 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#ifndef UTILS_CPPCLASS_H +#define UTILS_CPPCLASS_H + +struct candidate {int index; double value;}; + +extern bool candidate_cmp(candidate c_i, candidate c_j); + +extern void sparse_dot_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); + +extern void sparse_dot_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_topn); + +#endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx new file mode 100644 index 00000000..1cef2229 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -0,0 +1,120 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at# +# http://www.apache.org/licenses/LICENSE-2.0# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Zhe Sun, Ahmet Erdem +# April 20, 2017 + +# distutils: language = c++ + +import numpy as np +cimport numpy as np + +cdef extern from "sparse_dot_topn_parallel.h": + + cdef void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs); + + cdef void sparse_dot_minmax_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int minmax_ntop[], + int n_jobs); + +cpdef sparse_dot_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_jobs) + return + +cpdef sparse_dot_minmax_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] minmax_ntop, + int n_jobs + ): + + 
cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* o_minmax_ntop = &minmax_ntop[0] + + sparse_dot_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 21ead95d..73689cf5 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,8 +6,9 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn +from sparse_dot_topn import awesome_cossim_minmax_topn from functools import wraps +import time DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -247,13 +248,15 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() + # Calculate the matches using the cosine similarity - matches = self._build_matches(master_matrix, duplicate_matrix) - # retrieve all matches - self._matches_list = self._get_matches_list(matches) + self._true_max_n_matches, matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: # the list of matches needs to be symmetric!!! 
(i.e., if A != B and A matches B; then B matches A) - self._symmetrize_matches_list() + matches = StringGrouper._symmetrize_matrix(matches) + + # build list from matrix + self._matches_list = self._get_matches_list(matches) self.is_build = True return self @@ -434,7 +437,10 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: if self._config.min_similarity <= 0: + tic = time.perf_counter() self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) + toc = time.perf_counter() + print(f"1. _true_max_n_matches = {self._true_max_n_matches}; time: {toc - tic:0.4f} seconds", flush=True) if self._config.max_n_matches is None: self._max_n_matches = self._true_max_n_matches @@ -444,23 +450,14 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix 'use_threads': True, 'n_jobs': self._config.number_of_processes } - - return awesome_cossim_topn(tf_idf_matrix_1, tf_idf_matrix_2, + tic = time.perf_counter() + tup = awesome_cossim_minmax_topn(tf_idf_matrix_1, tf_idf_matrix_2, self._max_n_matches, self._config.min_similarity, **optional_kwargs) - - def _symmetrize_matches_list(self): - # [symmetrized matches_list] = [matches_list] UNION [transposed matches_list] (i.e., column-names swapped): - self._matches_list = self._matches_list.set_index(['master_side', 'dupe_side'])\ - .combine_first( - self._matches_list.rename( - columns={ - 'master_side': 'dupe_side', - 'dupe_side': 'master_side' - } - ).set_index(['master_side', 'dupe_side']) - ).reset_index() + toc = time.perf_counter() + print(f"2. 
_true_max_n_matches = {tup[0]}; time: {toc - tic:0.4f} seconds", flush=True) + return tup def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" @@ -479,6 +476,13 @@ def _get_non_matches_list(self) -> pd.DataFrame: missing_pairs['similarity'] = 0 return missing_pairs + @staticmethod + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + @staticmethod def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: """Returns the true maximum number of matches over all strings in master""" @@ -496,25 +500,12 @@ def get_n_matches(i: int) -> int: return np.amax(v(range(M))) @staticmethod - def _get_matches_list(matches) -> pd.DataFrame: + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" - non_zeros = matches.nonzero() - - sparserows = non_zeros[0] - sparsecols = non_zeros[1] - nr_matches = sparsecols.size - master_side = np.empty([nr_matches], dtype=int) - dupe_side = np.empty([nr_matches], dtype=int) - similarity = np.zeros(nr_matches) - - for index in range(0, nr_matches): - master_side[index] = sparserows[index] - dupe_side[index] = sparsecols[index] - similarity[index] = matches.data[index] - - matches_list = pd.DataFrame({'master_side': master_side, - 'dupe_side': dupe_side, - 'similarity': similarity}) + r, c = matches.nonzero() + matches_list = pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) return matches_list def _get_nearest_matches(self, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index cbc4bcae..452273ac 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,7 +9,10 @@ match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities 
from unittest.mock import patch +from scipy.sparse.csgraph._flow import csr_matrix +def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: + return A class SimpleExample(object): def __init__(self): @@ -196,14 +199,14 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matches_list') - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matches_list): + @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] sg = StringGrouper(df, max_n_matches=2).fit() - mock_symmetrize_matches_list.assert_called_once() + mock_symmetrize_matrix.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] @@ -333,7 +336,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[1].toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 68a51a1c84da823c95e03f034eb0fa76417ba654 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 18:02:45 
+0200 Subject: [PATCH 04/29] made significant performance enhancements: 1. _symmetrize_matrix instead of _symmetrize_matches_list (boost x5) 2. _get_matches_list (boost x33) 3. awesome_cossim_true_minmax_topn_only (boost x43) --- sparse_dot_topn/.gitignore | 4 +- sparse_dot_topn/__init__.py | 4 +- sparse_dot_topn/awesome_cossim_minmax_topn.py | 92 ------------------- sparse_dot_topn/awesome_cossim_topn.py | 62 +++++++++++++ sparse_dot_topn/sparse_dot_topn.pyx | 65 +++++++++++-- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 86 ++++++++++++++++- sparse_dot_topn/sparse_dot_topn_parallel.h | 13 ++- sparse_dot_topn/sparse_dot_topn_source.cpp | 68 ++++++++++++-- sparse_dot_topn/sparse_dot_topn_source.h | 64 +++++++------ sparse_dot_topn/sparse_dot_topn_threaded.pyx | 38 +++++++- string_grouper/string_grouper.py | 60 +++++------- string_grouper/test/test_string_grouper.py | 2 +- 12 files changed, 376 insertions(+), 182 deletions(-) delete mode 100644 sparse_dot_topn/awesome_cossim_minmax_topn.py diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore index d40e00a1..97caf501 100644 --- a/sparse_dot_topn/.gitignore +++ b/sparse_dot_topn/.gitignore @@ -1,4 +1,4 @@ +/sparse_dot_topn_threaded.cpp /sparse_dot_topn.cp39-win_amd64.pyd -/sparse_dot_topn_threaded.cp39-win_amd64.pyd /sparse_dot_topn.cpp -/sparse_dot_topn_threaded.cpp +/sparse_dot_topn_threaded.cp39-win_amd64.pyd diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index 09d2bfa7..9cfee892 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -2,6 +2,6 @@ import sys if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_minmax_topn import awesome_cossim_minmax_topn + from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only else: - from awesome_cossim_minmax_topn import awesome_cossim_minmax_topn \ No newline at end of file + from awesome_cossim_topn import awesome_cossim_topn, 
awesome_cossim_true_minmax_topn_only \ No newline at end of file diff --git a/sparse_dot_topn/awesome_cossim_minmax_topn.py b/sparse_dot_topn/awesome_cossim_minmax_topn.py deleted file mode 100644 index 92fdf87f..00000000 --- a/sparse_dot_topn/awesome_cossim_minmax_topn.py +++ /dev/null @@ -1,92 +0,0 @@ -import sys -import numpy as np -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr - -if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread -# else: - # import sparse_dot_topn as ct - # import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_minmax_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): - """ - This function will return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - A and B: two CSR matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - C: result matrix - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - return 0, csr_matrix((data, indices, indptr), shape=(M, N)) - - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_minmax_topn( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' - raise ValueError(err_str) - - ct_thread.sparse_dot_minmax_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, minmax_topn, n_jobs) - - return minmax_topn[0], csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index c4af03d4..b93298de 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -87,3 +87,65 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): indptr, indices, data, n_jobs) return csr_matrix((data, indices, indptr), shape=(M, N)) + +def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + 
use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_minmax_topn( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_minmax_topn_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 1da3181a..54771132 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -14,6 +14,8 @@ # Author: Zhe Sun, Ahmet Erdem # April 20, 2017 +# Modified by: Particular Miner +# April 14, 2021 # distutils: language = c++ @@ -37,7 +39,7 @@ cdef extern from "sparse_dot_topn_source.h": int Cj[], double Cx[]); - cdef void sparse_dot_minmax_topn_source( + cdef void sparse_dot_plus_minmax_topn_source( int n_row, int n_col, int Ap[], @@ -53,6 +55,15 @@ cdef extern from "sparse_dot_topn_source.h": double Cx[], int minmax_topn[]); + cdef void sparse_dot_only_minmax_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int minmax_topn[]); + cpdef sparse_dot_topn( int n_row, int n_col, @@ -70,7 +81,7 @@ cpdef sparse_dot_topn( ): """ Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrxi C in CSR format, where + This function will return a matrix C in CSR format, where C = [sorted top n results and results > lower_bound for each row of A * B] Input: @@ -103,7 +114,7 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -cpdef sparse_dot_minmax_topn( +cpdef sparse_dot_plus_minmax_topn( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -120,11 +131,11 @@ cpdef sparse_dot_minmax_topn( np.ndarray[int, ndim=1] minmax_topn ): """ - Cython glue function to call sparse_dot_minmax_topn C++ implementation + Cython glue function to call sparse_dot_plus_minmax_topn C++ implementation This function will return 
a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. It also returns minmax_ntop (the maximum number of columns set - for each row of A * B when ntop is infinite) + per row over all rows of A * B assuming ntop is infinite) Input: n_row: number of rows of A matrix @@ -138,8 +149,8 @@ cpdef sparse_dot_minmax_topn( Output by reference: c_indptr, c_indices, c_data: CSR expression of C matrix - minmax_ntop: the maximum number of columns set for each row of - A * B when ntop is infinite + minmax_ntop: the maximum number of columns set per row over all rows of + A * B assuming ntop is infinite N.B. A and B must be CSR format!!! The type of input numpy array must be aligned with types of C++ function aguments! @@ -156,5 +167,43 @@ cpdef sparse_dot_minmax_topn( cdef double* Cx = &c_data[0] cdef int* o_minmax_topn = &minmax_topn[0] - sparse_dot_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + sparse_dot_plus_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + return + +cpdef sparse_dot_only_minmax_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_minmax_topn_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) return \ No newline at end of file diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index dc123b80..c2b9a0b9 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -17,11 +17,14 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #include #include #include #include +#include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" @@ -274,7 +277,7 @@ for(int i = start_row; i < end_row; i++){ } -void sparse_dot_minmax_topn_parallel(int n_row, +void sparse_dot_plus_minmax_topn_parallel(int n_row, int n_col, int Ap[], int Aj[], @@ -364,3 +367,84 @@ void sparse_dot_minmax_topn_parallel(int n_row, *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); } + +void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inner, + int Ap_copy[], int Aj_copy[], + int Bp_copy[], int Bj_copy[], + int *minmax_ntop) +{ + std::vector unmarked(n_col_inner, true); + + for(int i = start_row; i < end_row; i++){ + + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + } +} + +void sparse_dot_only_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop, + int n_jobs) +{ + std::vector job_load_sz(n_jobs, n_row/n_jobs); + + int rem = n_row % n_jobs; + for (int r = 0; r < rem; r++) job_load_sz[r] += 1; + + std::vector> split_row_vector(n_jobs); + + std::vector split_minmax_ntop(n_jobs, 0); + + std::vector thread_list(n_jobs); + + int start = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + temp_vector[0] = start; + temp_vector[1] = start + job_load_sz[job_nr]; + start = temp_vector[1]; + + split_row_vector[job_nr] = temp_vector; + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + thread_list[job_nr] = std::thread (inner_sparse_only_minmax_function, + start_row, end_row, n_col, + Ap, Aj, Bp, Bj, + &split_minmax_ntop[job_nr]); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + + *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); +} diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index bd70b573..cb43cd1c 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H @@ -36,7 +38,7 @@ extern void sparse_dot_topn_parallel(int n_row, double Cx[], int n_jobs); -extern void sparse_dot_minmax_topn_parallel(int n_row, +extern void sparse_dot_plus_minmax_topn_parallel(int n_row, int n_col, int Ap[], int Aj[], @@ -52,4 +54,13 @@ extern void sparse_dot_minmax_topn_parallel(int n_row, int* minmax_topn, int n_jobs); +extern void sparse_dot_only_minmax_topn_parallel(int n_row, + int n_col, + 
int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop, + int n_jobs); + #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index e5cc3e12..dcf99637 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #include #include @@ -29,7 +31,7 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value /* C++ implementation of sparse_dot_topn - This function will return a matrxi C in CSR format, where + This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B] Input: @@ -131,12 +133,12 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_minmax_topn + C++ implementation of sparse_dot_plus_minmax_topn_source This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. It also returns minmax_ntop (the maximum number of columns set - for each row of A * B when ntop is infinite) + per row over all rows of A * B assuming ntop is infinite) Input: n_row: number of rows of A matrix @@ -150,12 +152,12 @@ void sparse_dot_topn_source(int n_row, Output by reference: Cp, Cj, Cx: CSR expression of C matrix - minmax_ntop: the maximum number of columns set for each row of - A * B when ntop is infinite + minmax_ntop: the maximum number of columns set per row over all + rows of A * B assuming ntop is infinite N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_minmax_topn_source(int n_row, +void sparse_dot_plus_minmax_topn_source(int n_row, int n_col, int Ap[], int Aj[], @@ -241,3 +243,57 @@ void sparse_dot_minmax_topn_source(int n_row, Cp[i+1] = nnz; } } + +/* + C++ implementation of sparse_dot_only_minmax_topn_source + + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row + over all rows of A * B + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_only_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop) +{ + std::vector unmarked(n_col, true); + + *minmax_ntop = 0; + + for(int i = 0; i < n_row; i++){ + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; // kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + } +} diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index d51de107..6143eb93 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H @@ -26,32 +28,40 @@ struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); extern void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); - -extern void sparse_dot_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int *minmax_topn); + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); //data of C + +extern void sparse_dot_plus_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int *minmax_topn); + +extern void sparse_dot_only_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 1cef2229..0bb45a6a 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -14,6 +14,8 @@ # Author: Zhe Sun, Ahmet Erdem # April 20, 2017 +# Modified by: Particular Miner +# April 
14, 2021 # distutils: language = c++ @@ -38,7 +40,7 @@ cdef extern from "sparse_dot_topn_parallel.h": double Cx[], int n_jobs); - cdef void sparse_dot_minmax_topn_parallel( + cdef void sparse_dot_plus_minmax_topn_parallel( int n_row, int n_col, int Ap[], @@ -55,6 +57,16 @@ cdef extern from "sparse_dot_topn_parallel.h": int minmax_ntop[], int n_jobs); + cdef void sparse_dot_only_minmax_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int minmax_ntop[], + int n_jobs); + cpdef sparse_dot_topn_threaded( int n_row, int n_col, @@ -86,7 +98,7 @@ cpdef sparse_dot_topn_threaded( lower_bound, Cp, Cj, Cx, n_jobs) return -cpdef sparse_dot_minmax_topn_threaded( +cpdef sparse_dot_plus_minmax_topn_threaded( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -115,6 +127,26 @@ cpdef sparse_dot_minmax_topn_threaded( cdef double* Cx = &c_data[0] cdef int* o_minmax_ntop = &minmax_ntop[0] - sparse_dot_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + sparse_dot_plus_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) return + +cpdef sparse_dot_only_minmax_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_ntop, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_ntop = &minmax_ntop[0] + + sparse_dot_only_minmax_topn_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_ntop, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 73689cf5..53fc3600 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,9 +6,8 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from 
typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_minmax_topn +from sparse_dot_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only from functools import wraps -import time DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -248,13 +247,11 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() - # Calculate the matches using the cosine similarity - self._true_max_n_matches, matches = self._build_matches(master_matrix, duplicate_matrix) + matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: - # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) + # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) - # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -435,29 +432,30 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: - if self._config.min_similarity <= 0: - tic = time.perf_counter() - self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) - toc = time.perf_counter() - print(f"1. 
_true_max_n_matches = {self._true_max_n_matches}; time: {toc - tic:0.4f} seconds", flush=True) - if self._config.max_n_matches is None: - self._max_n_matches = self._true_max_n_matches - optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { 'use_threads': True, 'n_jobs': self._config.number_of_processes } - tic = time.perf_counter() - tup = awesome_cossim_minmax_topn(tf_idf_matrix_1, tf_idf_matrix_2, - self._max_n_matches, - self._config.min_similarity, - **optional_kwargs) - toc = time.perf_counter() - print(f"2. _true_max_n_matches = {tup[0]}; time: {toc - tic:0.4f} seconds", flush=True) - return tup + + # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: + if self._config.min_similarity <= 0: + self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( + tf_idf_matrix_1, + tf_idf_matrix_2, + **optional_kwargs + ) + # if kwarg max_n_matches was not set then set it now to true value + if self._config.max_n_matches is None: + self._max_n_matches = self._true_max_n_matches + + return awesome_cossim_topn( + tf_idf_matrix_1, tf_idf_matrix_2, + self._max_n_matches, + self._config.min_similarity, + **optional_kwargs + ) def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" @@ -483,22 +481,6 @@ def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: A[c, r] = A[r, c] return A.tocsr() - @staticmethod - def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: - """Returns the true maximum number of matches over all strings in master""" - def get_n_matches(i: int) -> int: - a_cols = A.indices[A.indptr[i]:A.indptr[i+1]] - nz = np.full(N, 0, dtype=int) - for j in a_cols: - nz[B.indices[B.indptr[j]:B.indptr[j+1]]] = 1 - return np.sum(nz) - - A, B = AA.tocsr(), BB.tocsr() - M, _ = A.shape - _, N = B.shape - v = np.vectorize(get_n_matches) - return np.amax(v(range(M))) - @staticmethod def 
_get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 452273ac..64b8caf8 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -336,7 +336,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[1].toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 798daf78fe9727be621b45d29db7b45652951f95 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 19:17:51 +0200 Subject: [PATCH 05/29] updated setup.py --- setup.py | 63 +++++++++++++++++++--- sparse_dot_topn/__init__.py | 7 +-- sparse_dot_topn/awesome_cossim_topn.py | 9 +--- string_grouper/string_grouper.py | 24 ++++----- string_grouper/test/test_string_grouper.py | 12 ++--- 5 files changed, 77 insertions(+), 38 deletions(-) diff --git a/setup.py b/setup.py index f4b5ecb0..535aa5c7 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ -from setuptools import setup +from setuptools import setup, Extension import pathlib +import os # The directory containing this file HERE = pathlib.Path(__file__).parent @@ -7,13 +8,53 @@ # The text of the README file README = (HERE / "README.md").read_text() +# workaround for numpy and Cython install dependency +# the solution is from https://stackoverflow.com/a/54138355 +def my_build_ext(pars): + # import delayed: + from setuptools.command.build_ext import build_ext as _build_ext + class build_ext(_build_ext): + def finalize_options(self): + _build_ext.finalize_options(self) + # Prevent numpy from thinking it is still in its 
setup process: + __builtins__.__NUMPY_SETUP__ = False + import numpy + self.include_dirs.append(numpy.get_include()) + + #object returned: + return build_ext(pars) + +if os.name == 'nt': + extra_compile_args = ["-Ox"] +else: + extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] + +original_ext = Extension('sparse_dot_topn.sparse_dot_topn', + sources=['./sparse_dot_topn/sparse_dot_topn.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp'], + extra_compile_args=extra_compile_args, + language='c++') + +threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', + sources=[ + './sparse_dot_topn/sparse_dot_topn_threaded.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp', + './sparse_dot_topn/sparse_dot_topn_parallel.cpp'], + extra_compile_args=extra_compile_args, + language='c++') + setup( name='string_grouper', version='0.4.0', - packages=['string_grouper'], + packages=[ + 'string_grouper' + , 'string_grouper_utils' + , 'sparse_dot_topn' + ], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', + keywords='cosine-similarity sparse-matrix sparse-graph scipy cython', author='Chris van den Berg', long_description=README, long_description_content_type="text/markdown", @@ -21,10 +62,20 @@ url='https://github.com/Bergvca/string_grouper', zip_safe=False, python_requires='>3.7', - install_requires=['pandas>=0.25.3' + setup_requires=[# Setuptools 18.0 properly handles Cython extensions. + 'setuptools>=18.0' + , 'cython>=0.29.15' + , 'numpy' + , 'scipy' + ], + install_requires=[# Setuptools 18.0 properly handles Cython extensions. 
+ 'setuptools>=18.0' + , 'cython>=0.29.15' + , 'numpy' , 'scipy' , 'scikit-learn' - , 'numpy' - , 'sparse_dot_topn>=0.2.6' - ] + , 'pandas>=0.25.3' + ], + cmdclass={'build_ext': my_build_ext}, + ext_modules=[original_ext, threaded_ext] ) diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index 9cfee892..d7e882f0 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -1,7 +1,2 @@ # flake8: noqa -import sys - -if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only -else: - from awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only \ No newline at end of file +from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index b93298de..ee6c2ca2 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -1,14 +1,9 @@ -import sys import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread -else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread +from sparse_dot_topn import sparse_dot_topn as ct +from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 53fc3600..69ecd912 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -249,8 +249,8 @@ def fit(self) -> 'StringGrouper': master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity matches = 
self._build_matches(master_matrix, duplicate_matrix) - if self._duplicates is None: - # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) + if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: + # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) # build list from matrix self._matches_list = self._get_matches_list(matches) @@ -439,16 +439,16 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix 'n_jobs': self._config.number_of_processes } - # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: - if self._config.min_similarity <= 0: - self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( - tf_idf_matrix_1, - tf_idf_matrix_2, - **optional_kwargs - ) - # if kwarg max_n_matches was not set then set it now to true value - if self._config.max_n_matches is None: - self._max_n_matches = self._true_max_n_matches + # compute the true maximum number of matches over all strings in master: + self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( + tf_idf_matrix_1, + tf_idf_matrix_2, + **optional_kwargs + ) + + if self._config.min_similarity <= 0 and self._config.max_n_matches is None: + # if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value + self._max_n_matches = self._true_max_n_matches return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 64b8caf8..c928bfa3 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -3,13 +3,11 @@ import numpy as np from scipy.sparse.csr import csr_matrix from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ - DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \ - 
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ + DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ - match_most_similar, group_similar_strings, match_strings,\ + match_most_similar, group_similar_strings, match_strings, \ compute_pairwise_similarities from unittest.mock import patch -from scipy.sparse.csgraph._flow import csr_matrix def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: return A @@ -383,7 +381,7 @@ def test_get_matches_single(self): left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] left_index = [0, 0, 1, 2, 3, 3] - right_index = [0, 3, 1, 2, 0, 3] + right_index = [3, 0, 1, 2, 3, 0] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, @@ -399,8 +397,8 @@ def test_get_matches_1_series_1_id_series(self): left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] left_index = [0, 0, 1, 2, 3, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] - right_index = [0, 3, 1, 2, 0, 3] + right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'] + right_index = [3, 0, 1, 2, 3, 0] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, From 5b2ce38bfeb19eec321c68f2db5bf76cc45faa20 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 17 Apr 2021 09:01:02 +0200 Subject: [PATCH 06/29] attempted to remove n_max_matches restriction altogether --- sparse_dot_topn/.gitignore | 4 - sparse_dot_topn/awesome_cossim_topn.py | 85 +++++++- sparse_dot_topn/sparse_dot_topn.pyx | 73 ++++--- sparse_dot_topn/sparse_dot_topn_source.cpp | 213 +++++++++++++++++++-- 
sparse_dot_topn/sparse_dot_topn_source.h | 27 ++- 5 files changed, 338 insertions(+), 64 deletions(-) delete mode 100644 sparse_dot_topn/.gitignore diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore deleted file mode 100644 index 97caf501..00000000 --- a/sparse_dot_topn/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/sparse_dot_topn_threaded.cpp -/sparse_dot_topn.cp39-win_amd64.pyd -/sparse_dot_topn.cpp -/sparse_dot_topn_threaded.cp39-win_amd64.pyd diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index ee6c2ca2..808f5d8b 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -40,6 +40,88 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): nnz_max = M*ntop + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return csr_matrix((data, indices, indptr), shape=(M, N)) + + # indptr is the only array whose length is known + indptr = np.empty(M+1, dtype=idx_dtype) + + if not False: + + indices, data = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr) + # print(f'(M, N) = {(M, N)}') + # print(f'indptr = {indptr}') + # print(f'indptr.flags = {indptr.flags}') + # print(f'indices = {indices}') + # print(f'indices.flags = {indices.flags}') + # print(f'data = {data}') + # print(f'data.flags = {data.flags}') + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, n_jobs) + + return csr_matrix((data, indices, indptr), shape=(M, N)) + + +def suspend_awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + # basic check. if A or B are all zeros matrix, return all zero matrix directly if len(A.indices) == 0 or len(B.indices) == 0: indptr = np.zeros(M + 1, dtype=idx_dtype) @@ -83,6 +165,7 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): return csr_matrix((data, indices, indptr), shape=(M, N)) + def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): """ This function will return the maximum number of columns set @@ -116,7 +199,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): minmax_topn = np.full(1, 0, dtype=idx_dtype) - # basic check. if A or B are all zeros matrix, return all zero matrix directly + # basic check. 
if A or B are all zeros matrix, return 0 directly if len(A.indices) == 0 or len(B.indices) == 0: return 0 diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 54771132..59ed57bf 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -19,9 +19,17 @@ # distutils: language = c++ -import numpy as np +from libc.stdio cimport printf +from libcpp.vector cimport vector +from libc.stdlib cimport free +from cpython.pycapsule cimport PyCapsule_New, PyCapsule_IsValid, PyCapsule_GetPointer, PyCapsule_GetName cimport numpy as np +np.import_array() + +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( @@ -39,7 +47,7 @@ cdef extern from "sparse_dot_topn_source.h": int Cj[], double Cx[]); - cdef void sparse_dot_plus_minmax_topn_source( + cdef void sparse_dot_free_source( int n_row, int n_col, int Ap[], @@ -48,12 +56,10 @@ cdef extern from "sparse_dot_topn_source.h": int Bp[], int Bj[], double Bx[], - int topn, double lower_bound, int Cp[], - int Cj[], - double Cx[], - int minmax_topn[]); + vector[int]* Cj, + vector[double]* Cx); cdef void sparse_dot_only_minmax_topn_source( int n_row, @@ -114,7 +120,13 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -cpdef sparse_dot_plus_minmax_topn( +# destructor +cdef void free_ptr(object cap): + # This should probably have some error checking in + # or at very least clear any errors raised once it's done + free(PyCapsule_GetPointer(cap, PyCapsule_GetName(cap))) + +cpdef sparse_dot_free( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -123,19 +135,13 @@ cpdef sparse_dot_plus_minmax_topn( np.ndarray[int, ndim=1] b_indptr, np.ndarray[int, ndim=1] b_indices, np.ndarray[double, ndim=1] b_data, - int ntop, double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - 
np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] minmax_topn + np.ndarray[int, ndim=1] c_indptr ): """ - Cython glue function to call sparse_dot_plus_minmax_topn C++ implementation + Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - It also returns minmax_ntop (the maximum number of columns set - per row over all rows of A * B assuming ntop is infinite) + C = [all results > lower_bound for each row of A * B] Input: n_row: number of rows of A matrix @@ -144,16 +150,13 @@ cpdef sparse_dot_plus_minmax_topn( a_indptr, a_indices, a_data: CSR expression of A matrix b_indptr, b_indices, b_data: CSR expression of B matrix - ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: c_indptr, c_indices, c_data: CSR expression of C matrix - minmax_ntop: the maximum number of columns set per row over all rows of - A * B assuming ntop is infinite N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function aguments! + The type of input numpy array must be aligned with types of C++ function arguments! 
""" cdef int* Ap = &a_indptr[0] @@ -163,12 +166,32 @@ cpdef sparse_dot_plus_minmax_topn( cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* o_minmax_topn = &minmax_topn[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx) + + cdef np.npy_intp nnz = Cp[n_row] + cdef np.ndarray[np.int32_t, ndim=1] c_indices = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_INT32, vCj.data()) + PyArray_ENABLEFLAGS(c_indices, np.NPY_OWNDATA) + cdef np.ndarray[np.double_t, ndim=1] c_data = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_DOUBLE, vCx.data()) + PyArray_ENABLEFLAGS(c_data, np.NPY_OWNDATA) + + # cdef const char *name_vCj_capsule = "vCj" + # cdef int* vCj_data = vCj.data() + # vCj_capsule = PyCapsule_New( vCj_data, name_vCj_capsule, &free_ptr) + # if not PyCapsule_IsValid(vCj_capsule, name_vCj_capsule): + # raise ValueError(f"invalid pointer ({name_vCj_capsule}) to parameters") + # + # cdef const char *name_vCx_capsule = "vCx" + # cdef double* vCx_data = vCx.data() + # vCx_capsule = PyCapsule_New( vCx_data, name_vCx_capsule, &free_ptr) + # if not PyCapsule_IsValid(vCx_capsule, name_vCx_capsule): + # raise ValueError(f"invalid pointer ({name_vCx_capsule}) to parameters") + + return c_indices, c_data - sparse_dot_plus_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) - return cpdef sparse_dot_only_minmax_topn( int n_row, diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index dcf99637..c4544790 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -133,12 +133,10 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_plus_minmax_topn_source + C++ implementation of sparse_dot_source This function will 
return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - It also returns minmax_ntop (the maximum number of columns set - per row over all rows of A * B assuming ntop is infinite) + C = [all results > lower_bound sorted for each row of A * B]. Input: n_row: number of rows of A matrix @@ -147,17 +145,15 @@ void sparse_dot_topn_source(int n_row, Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results + memory_bound: the maximum number of elements per row of C lower_bound: a threshold that the element of A*B must greater than Output by reference: Cp, Cj, Cx: CSR expression of C matrix - minmax_ntop: the maximum number of columns set per row over all - rows of A * B assuming ntop is infinite N.B. A and B must be CSR format!!! */ -void sparse_dot_plus_minmax_topn_source(int n_row, +void sparse_dot_source(int n_row, int n_col, int Ap[], int Aj[], @@ -165,24 +161,22 @@ void sparse_dot_plus_minmax_topn_source(int n_row, int Bp[], int Bj[], double Bx[], //data of B - int ntop, + int memory_bound, double lower_bound, int Cp[], int Cj[], - double Cx[], - int *minmax_ntop) + double Cx[]) { std::vector next(n_col,-1); std::vector sums(n_col, 0); std::vector candidates; + candidates.reserve(memory_bound); int nnz = 0; Cp[0] = 0; - *minmax_ntop = 0; - for(int i = 0; i < n_row; i++){ int head = -2; int length = 0; @@ -207,7 +201,6 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } } } - *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) @@ -226,12 +219,7 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } + std::sort(candidates.begin(), candidates.end(), candidate_cmp); for(int a=0; a < len; a++){ Cj[nnz] = candidates[a].index; @@ -244,6 +232,191 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } } +/* + C++ implementation of sparse_dot_free_source + + This function will return a matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B]. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + memory_bound: the maximum number of elements per row of C + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp: C array for idx_pointer of CSR expression of C matrix + Cj: numpy array for indices of CSR expression of C matrix + Cx: numpy array for data values of CSR expression of C matrix + + N.B. A and B must be CSR format!!! 
+*/ +void sparse_dot_free_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx) +{ + int sz = std::max(n_row, n_col); + Cj->reserve(sz); + Cx->reserve(sz); + + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + + for(int a=0; a < len; a++){ + Cj->push_back(candidates[a].index); + Cx->push_back(candidates[a].value); + } + candidates.clear(); + + Cp[i+1] = Cj->size(); + } +} + +/* + C++ implementation of sparse_dot_nnz_source + + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. 
+ This function is designed primarily to help with memory management for + very large sparse matrices. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + lower_bound: a threshold that the element of A*B must greater than + + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_nnz_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int* nnz, + int* ntop) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + *nnz = 0; + *ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + int nnz_k = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + *ntop = (nnz_k > *ntop)? 
nnz_k : *ntop; + *nnz += nnz_k; + } +} + /* C++ implementation of sparse_dot_only_minmax_topn_source diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 6143eb93..664378e3 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -23,6 +23,7 @@ #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H + struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); @@ -41,20 +42,18 @@ extern void sparse_dot_topn_source(int n_row, int Cj[], double Cx[]); //data of C -extern void sparse_dot_plus_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - int *minmax_topn); +extern void sparse_dot_free_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx); extern void sparse_dot_only_minmax_topn_source(int n_row, int n_col, From d6f31278636287880e5f66d9ef6290dedcb52732 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 17 Apr 2021 09:48:44 +0200 Subject: [PATCH 07/29] removed the restriction n_max_matches put on memory allocation --- setup.py | 19 +- sparse_dot_topn/__init__.py | 7 +- sparse_dot_topn/array_wrappers.pxd | 18 + sparse_dot_topn/array_wrappers.pyx | 73 ++ sparse_dot_topn/awesome_cossim_topn.py | 263 +++---- sparse_dot_topn/example/comparison.py | 137 ++++ sparse_dot_topn/example/comparison2.py | 169 ++++ sparse_dot_topn/example/example.py | 14 + sparse_dot_topn/sparse_dot_topn.pyx | 261 ++++--- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 731 +++++++++++------- sparse_dot_topn/sparse_dot_topn_parallel.h | 97 ++- 
sparse_dot_topn/sparse_dot_topn_source.cpp | 159 ++-- sparse_dot_topn/sparse_dot_topn_source.h | 68 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 239 +++--- .../test/test_awesome_cossim_topn.py | 346 +++++++++ string_grouper/string_grouper.py | 25 +- string_grouper/test/test_string_grouper.py | 2 +- 17 files changed, 1892 insertions(+), 736 deletions(-) create mode 100644 sparse_dot_topn/array_wrappers.pxd create mode 100644 sparse_dot_topn/array_wrappers.pyx create mode 100644 sparse_dot_topn/example/comparison.py create mode 100644 sparse_dot_topn/example/comparison2.py create mode 100644 sparse_dot_topn/example/example.py create mode 100644 sparse_dot_topn/test/test_awesome_cossim_topn.py diff --git a/setup.py b/setup.py index 535aa5c7..5cb9c5e0 100644 --- a/setup.py +++ b/setup.py @@ -29,9 +29,19 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] +array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', + sources=[ + './sparse_dot_topn/array_wrappers.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], + extra_compile_args=extra_compile_args, + language='c++') + original_ext = Extension('sparse_dot_topn.sparse_dot_topn', - sources=['./sparse_dot_topn/sparse_dot_topn.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp'], + sources=[ + './sparse_dot_topn/sparse_dot_topn.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], extra_compile_args=extra_compile_args, language='c++') @@ -39,7 +49,8 @@ def finalize_options(self): sources=[ './sparse_dot_topn/sparse_dot_topn_threaded.pyx', './sparse_dot_topn/sparse_dot_topn_source.cpp', - './sparse_dot_topn/sparse_dot_topn_parallel.cpp'], + './sparse_dot_topn/sparse_dot_topn_parallel.cpp' + ], extra_compile_args=extra_compile_args, language='c++') @@ -77,5 +88,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[original_ext, threaded_ext] + ext_modules=[array_wrappers_ext, original_ext, threaded_ext] 
) diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index d7e882f0..cbaf32a7 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -1,2 +1,7 @@ # flake8: noqa -from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only +import sys + +if sys.version_info[0] >= 3: + from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn +else: + from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd new file mode 100644 index 00000000..f3342ef5 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pxd @@ -0,0 +1,18 @@ +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx new file mode 100644 index 00000000..d0dd4f3e --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pyx @@ -0,0 +1,73 @@ +from cpython cimport Py_buffer +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 808f5d8b..6e459b29 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -1,27 +1,55 @@ +import sys import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from sparse_dot_topn import sparse_dot_topn as ct -from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +else: + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_topn( + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + ntop_is_flexible=False, + mem_manager_is_C=False, + return_best_topn=False + ): """ - This function will 
return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True it will also return best_topn (the + true maximum number of elements > lower_bound per row of A * B). Input: - A and B: two CSR matrix + A and B: two CSR matrices ntop: n top results lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not + use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 + ntop_is_flexible: if True, memory management will be handed over to C/C++ if + python's attempt at allocating memory fails. + mem_manager_is_C: (this is mainly for testing purposes) if True, will force + memory management to be handed over to C/C++. Should be + used only when ntop >= number of columns of B or + ntop_is_flexible=True. Defaults to False. + return_best_topn: if True, will return best_topn together with C as a tuple: + (C, best_topn) Output: - C: result matrix + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. - N.B. if A and B are not CSR format, they will be converted to CSR + N.B. 
if A and B are not in CSR format, they will be converted to CSR """ if not isspmatrix_csr(A): A = A.tocsr() @@ -45,128 +73,105 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): indptr = np.zeros(M + 1, dtype=idx_dtype) indices = np.zeros(nnz_max, dtype=idx_dtype) data = np.zeros(nnz_max, dtype=A.dtype) - return csr_matrix((data, indices, indptr), shape=(M, N)) - - # indptr is the only array whose length is known - indptr = np.empty(M+1, dtype=idx_dtype) - - if not False: - - indices, data = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr) - # print(f'(M, N) = {(M, N)}') - # print(f'indptr = {indptr}') - # print(f'indptr.flags = {indptr.flags}') - # print(f'indices = {indices}') - # print(f'indices.flags = {indices.flags}') - # print(f'data = {data}') - # print(f'data.flags = {data.flags}') - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, n_jobs) - - return csr_matrix((data, indices, indptr), shape=(M, N)) - - -def suspend_awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): - """ - This function will return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - A and B: two CSR matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - C: result matrix - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - return csr_matrix((data, indices, indptr), shape=(M, N)) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output # filled matrices from here on indptr = np.empty(M+1, dtype=idx_dtype) - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - if not use_threads: - - ct.sparse_dot_topn( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data) - + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + if ntop_is_flexible or ntop >= N: + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr + ) + + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr, n_jobs + ) + + else: + if mem_manager_is_C: + raise Exception('When mem_manager_is_C=True, set ntop >= N, or set ntop_is_flexible=True') + else: + raise Exception('Not enough memory! Data array is too large. Try reducing the value of ntop.') + else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, n_jobs) - - return csr_matrix((data, indices, indptr), shape=(M, N)) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' + raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + + best_topn = best_topn_arr[0] + + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output -def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): +def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): """ This function will return the maximum number of columns set per row over all rows of A * B @@ -205,7 +210,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): if not use_threads: - ct.sparse_dot_only_minmax_topn( + ct.sparse_dot_only_max_nnz_col( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), @@ -218,7 +223,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): err_str = 
'You select the multi-thread mode and n_job must be a value greater equal than 1!' raise ValueError(err_str) - ct_thread.sparse_dot_only_minmax_topn_threaded( + ct_thread.sparse_dot_only_max_nnz_col_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), diff --git a/sparse_dot_topn/example/comparison.py b/sparse_dot_topn/example/comparison.py new file mode 100644 index 00000000..7ee673ca --- /dev/null +++ b/sparse_dot_topn/example/comparison.py @@ -0,0 +1,137 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import numpy as np +from scipy.sparse import coo_matrix +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +N = 1000 +thresh = 0.01 + +nr_vocab = 2 << 24 +density = 1e-6 +n_samples = 1000000 +n_duplicates = 1000000 +nnz_a = int(n_samples * nr_vocab * density) +nnz_b = int(n_duplicates * nr_vocab * density) + + +print(f'density = {density}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_a = {nnz_a}', flush=True) +print(f'nnz_b = {nnz_b}', flush=True) +print('\n', flush=True) + +rng1 = np.random.RandomState(42) +rng2 = np.random.RandomState(43) + +row = rng1.randint(n_samples, size=nnz_a) +cols = rng2.randint(nr_vocab, size=nnz_a) +data = rng1.rand(nnz_a) + +a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) +a = a_sparse.tocsr() + +row = rng1.randint(n_duplicates, size=nnz_b) +cols = rng2.randint(nr_vocab, size=nnz_b) +data = rng1.rand(nnz_b) + +b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) +b = b_sparse.T.tocsr() + + +# top 5 results per row + +print("Original sparse_dot_topn function") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 1 thread") + 
+rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 2 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 3 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 4 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 5 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 6 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 7 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', + number=3, + globals=globals()) +print(rtv) + +# use scipy and numpy function + + +def get_csr_ntop_idx_data(csr_row, ntop): + """ + Get list (row index, score) of the n top matches + """ + nnz = csr_row.getnnz() + if nnz == 0: + return None + elif nnz <= ntop: + result = zip(csr_row.indices, csr_row.data) + else: + arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] + result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) + + return sorted(result, key=lambda x: -x[1]) + + +def scipy_cossim_top(A, B, ntop, lower_bound=0): + C = A.dot(B) + return [get_csr_ntop_idx_data(row, ntop) for row in C] + +# top 5 results per row which element is greater than 2 + + +print("Scipy+numpy original function") + +rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py new file mode 100644 index 00000000..7af5d08a 
--- /dev/null +++ b/sparse_dot_topn/example/comparison2.py @@ -0,0 +1,169 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import numpy as np +from scipy.sparse import coo_matrix +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +N = 1000 +thresh = 0.01 + +nr_vocab = 2 << 24 +density = 1e-6 +n_samples = 1000000 +n_duplicates = N +nnz_a = int(n_samples * nr_vocab * density) +nnz_b = int(n_duplicates * nr_vocab * density) + +print(f'density = {density}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_a = {nnz_a}', flush=True) +print(f'nnz_b = {nnz_b}', flush=True) +print('', flush=True) + +rng1 = np.random.RandomState(42) +rng2 = np.random.RandomState(43) + +row = rng1.randint(n_samples, size=nnz_a) +cols = rng2.randint(nr_vocab, size=nnz_a) +data = rng1.rand(nnz_a) + +a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) +a = a_sparse.tocsr() + +row = rng1.randint(n_duplicates, size=nnz_b) +cols = rng2.randint(nr_vocab, size=nnz_b) +data = rng1.rand(nnz_b) + +b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) +b = b_sparse.T.tocsr() + + +# top 5 results per row + +print("Non-parallelized sparse_dot_topn function") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 1 thread") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, mem_manager_is_C=True)', + number=3, + globals=globals()) 
+print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 2 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 3 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 4 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 5 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 6 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 7 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, 
b, N, thresh, True, 7)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + +# use scipy and numpy function + + +def get_csr_ntop_idx_data(csr_row, ntop): + """ + Get list (row index, score) of the n top matches + """ + nnz = csr_row.getnnz() + if nnz == 0: + return None + elif nnz <= ntop: + result = zip(csr_row.indices, csr_row.data) + else: + arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] + result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) + + return sorted(result, key=lambda x: -x[1]) + + +def scipy_cossim_top(A, B, ntop, lower_bound=0): + C = A.dot(B) + return [get_csr_ntop_idx_data(row, ntop) for row in C] + +# top 5 results per row which element is greater than 2 + + +print("Scipy+numpy original function") + +rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) diff --git a/sparse_dot_topn/example/example.py b/sparse_dot_topn/example/example.py new file mode 100644 index 00000000..a61951fd --- /dev/null +++ b/sparse_dot_topn/example/example.py @@ -0,0 +1,14 @@ +from scipy.sparse import rand +from sparse_dot_topn import awesome_cossim_topn + +N = 10 +a = rand(100, 1000000, density=0.005, format='csr') +b = rand(1000000, 200, density=0.005, format='csr') + +# Use standard implementation + +c = awesome_cossim_topn(a, b, 5, 0.01) + +# Use parallel implementation with 4 threads + +d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 59ed57bf..9c35d3e9 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -19,72 +19,91 @@ # distutils: language = c++ -from libc.stdio cimport printf from libcpp.vector cimport vector -from libc.stdlib cimport free -from 
cpython.pycapsule cimport PyCapsule_New, PyCapsule_IsValid, PyCapsule_GetPointer, PyCapsule_GetName +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double + cimport numpy as np +import numpy as np np.import_array() -cdef extern from "numpy/arrayobject.h": - void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); - cdef void sparse_dot_only_minmax_topn_source( +cpdef sparse_dot_topn( int n_row, int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int minmax_topn[]); - -cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - 
int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): """ Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where @@ -104,7 +123,7 @@ cpdef sparse_dot_topn( c_indptr, c_indices, c_data: CSR expression of C matrix N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function aguments! + The type of input numpy array must be aligned with types of C++ function arguments! """ cdef int* Ap = &a_indptr[0] @@ -120,28 +139,79 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -# destructor -cdef void free_ptr(object cap): - # This should probably have some error checking in - # or at very least clear any errors raised once it's done - free(PyCapsule_GetPointer(cap, PyCapsule_GetName(cap))) +cpdef sparse_dot_topn_extd( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + 
The maximum number of elements per row of C nminmax is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + nminmax: The maximum number of elements per row of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): """ - Cython glue function to call sparse_dot_topn C++ implementation + Cython glue function to call sparse_dot_free C++ implementation This function will return a matrix C in CSR format, where C = [all results > lower_bound for each row of A * B] + This function lets C++ decide how to 
manage (grow/allocate/reallocate) memory for the + storage of these results as needed during the computation; then hands over to numpy + a pointer to the memory location where the data resides Input: n_row: number of rows of A matrix @@ -166,42 +236,29 @@ cpdef sparse_dot_free( cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx) - - cdef np.npy_intp nnz = Cp[n_row] - cdef np.ndarray[np.int32_t, ndim=1] c_indices = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_INT32, vCj.data()) - PyArray_ENABLEFLAGS(c_indices, np.NPY_OWNDATA) - cdef np.ndarray[np.double_t, ndim=1] c_data = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_DOUBLE, vCx.data()) - PyArray_ENABLEFLAGS(c_data, np.NPY_OWNDATA) + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - # cdef const char *name_vCj_capsule = "vCj" - # cdef int* vCj_data = vCj.data() - # vCj_capsule = PyCapsule_New( vCj_data, name_vCj_capsule, &free_ptr) - # if not PyCapsule_IsValid(vCj_capsule, name_vCj_capsule): - # raise ValueError(f"invalid pointer ({name_vCj_capsule}) to parameters") - # - # cdef const char *name_vCx_capsule = "vCx" - # cdef double* vCx_data = vCx.data() - # vCx_capsule = PyCapsule_New( vCx_data, name_vCx_capsule, &free_ptr) - # if not PyCapsule_IsValid(vCx_capsule, name_vCx_capsule): - # raise ValueError(f"invalid pointer ({name_vCx_capsule}) to parameters") + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - return c_indices, c_data - - -cpdef sparse_dot_only_minmax_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - 
np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): + return c_indices, c_data, nminmax[0] + + +cpdef sparse_dot_only_max_nnz_col( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): """ Cython glue function to call sparse_dot_only_minmax_topn C++ implementation This function will return the maximum number of columns set @@ -228,5 +285,5 @@ cpdef sparse_dot_only_minmax_topn( cdef int* Bj = &b_indices[0] cdef int* o_minmax_topn = &minmax_topn[0] - sparse_dot_only_minmax_topn_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return \ No newline at end of file + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index c2b9a0b9..d941248e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -23,355 +23,563 @@ #include #include #include +#include #include #include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" -void inner_sparse_function(int start_row, int end_row, int n_col_inner, - int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], std::vector real_candidates[]) -{ - -std::vector next(n_col_inner,-1); -std::vector sums(n_col_inner, 0); -std::vector temp_candidates; +void distribute_load( + int load_sz, + int n_jobs, + std::vector> &ranges +) +{ + // share the load among jobs: + int equal_job_load_sz = load_sz/n_jobs; + int rem = load_sz % n_jobs; + ranges.resize(n_jobs); -int iterations_count = 0; + int start = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); -for(int i = start_row; i < end_row; i++){ + temp_vector[0] = 
start; + temp_vector[1] = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); + start = temp_vector[1]; - iterations_count += 1; + ranges[job_nr] = temp_vector; + } +} - int head = -2; - int length = 0; +void inner_gather_function( + int start_row, + int end_row, + int Cp[], + int Cp_start, + int vCj_start[], + double vCx_start[], + std::vector real_candidates[] +) +{ + int Cp_i = Cp_start; + int* vCj_cursor = &vCj_start[Cp_start]; + double* vCx_cursor = &vCx_start[Cp_start]; + candidate c; + for (int i = start_row; i < end_row; i++){ + Cp_i += (int) real_candidates[i].size(); + Cp[i + 1] = Cp_i; + for (unsigned int j = 0; j < real_candidates[i].size(); j++){ + c = real_candidates[i][j]; + *(vCj_cursor++) = c.index; + *(vCx_cursor++) = c.value; + } + real_candidates[i].clear(); + } +} - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; +void inner_sparse_dot_topn( + int start_row, + int end_row, + int n_col_inner, + int ntop_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total +) +{ + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) + std::vector temp_candidates; - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j + for(int i = start_row; i < end_row; i++){ - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + int head = -2; + int length = 0; - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + 
for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - temp_candidates.push_back(c); - } + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j - int temp = head; - head = next[head]; //iterate over columns + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - int len = (int)temp_candidates.size(); - if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); - len = ntop_inner; - } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); - } + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - temp_candidates.resize(len); - real_candidates[i] = temp_candidates; + int temp = head; + head = next[head]; //iterate over columns - temp_candidates.clear(); + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } -} + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } + else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + (*total) += len; + temp_candidates.resize(len); + 
real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } } -void sparse_dot_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs) +void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs +) { - - Cp[0] = 0; - - int split_amount = n_row / n_jobs; - - std::vector> split_row_vector(n_jobs); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; real_cand_pointer = &real_candidates[0]; - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - int start_split = job_nr * split_amount; - int end_split = start_split + split_amount; + // initialize aggregate: + std::vector sub_total(n_jobs, 0); - if (job_nr == n_jobs -1) { - end_split = n_row; - } + std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - temp_vector[0] = start_split; - temp_vector[1] = end_split; + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - split_row_vector[job_nr] = temp_vector; + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn, + start_row, end_row, + n_col, ntop, + lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &sub_total[job_nr] + ); + } - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); + // gather the results: + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; 
job_nr++) { - int start_row = split_row_vector[job_nr][0]; int end_row = split_row_vector[job_nr][1]; + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + Cj, + Cx, + real_cand_pointer + ); + } - thread_list[job_nr] = std::thread (inner_sparse_function, start_row, - end_row, n_col, ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - } +} - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - thread_list[job_nr].join(); - } +void inner_sparse_dot_topn_extd( + int start_row, + int end_row, + int n_col_inner, + int ntop_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total, + int* n_minmax +) +{ - int nnz = 0; + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - for (int m = 0; m < n_row; m++) { + std::vector temp_candidates; - std::vector cand = real_cand_pointer[m]; + int iterations_count = 0; - int can_len = (int)cand.size(); + for(int i = start_row; i < end_row; i++){ - for(int can_nr=0; can_nr < can_len; can_nr++){ - Cj[nnz] = cand[can_nr].index; - Cx[nnz] = cand[can_nr].value; - nnz++; - } + iterations_count += 1; - Cp[m+1] = nnz; + int head = -2; + int length = 0; - } + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; -} + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) -void inner_sparse_minmax_function(int start_row, int end_row, int n_col_inner, - int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], std::vector real_candidates[], - int *minmax_ntop) -{ + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column 
of B in row j -std::vector next(n_col_inner,-1); -std::vector sums(n_col_inner, 0); + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i -std::vector temp_candidates; + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } -int iterations_count = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) -for(int i = start_row; i < end_row; i++){ + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - iterations_count += 1; + int temp = head; + head = next[head]; //iterate over columns - int head = -2; - int length = 0; + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; + int len = (int)temp_candidates.size(); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } + else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) + (*total) += len; + temp_candidates.resize(len); + real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } +} - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j +void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *n_minmax, + int n_jobs +) +{ + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + std::vector> real_candidates(n_row); + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + // initialize aggregates: + std::vector sub_total(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + std::vector thread_list(n_jobs); - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - temp_candidates.push_back(c); - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int temp = head; - head = next[head]; //iterate over columns + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn_extd, + start_row, end_row, + n_col, ntop, + lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &sub_total[job_nr], + &split_n_minmax[job_nr] + ); } - int len = (int)temp_candidates.size(); - if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); - len = ntop_inner; - } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); + // gather the results: + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - temp_candidates.resize(len); - real_candidates[i] = temp_candidates; + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + + Cp[0] = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - temp_candidates.clear(); + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; -} + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + Cj, + Cx, + real_cand_pointer + ); + } + + for (int job_nr = 0; job_nr < n_jobs; 
job_nr++) + thread_list[job_nr].join(); } -void sparse_dot_plus_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int *minmax_ntop, - int n_jobs) +void inner_sparse_dot_free( + int start_row, + int end_row, + int n_col_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total, + int* n_minmax +) { - Cp[0] = 0; + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - int split_amount = n_row / n_jobs; + std::vector temp_candidates; - std::vector> split_row_vector(n_jobs); + for(int i = start_row; i < end_row; i++){ - std::vector> real_candidates(n_row); + int head = -2; + int length = 0; - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; - std::vector split_minmax_ntop(n_jobs, 0); + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) - std::vector thread_list(n_jobs); + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - int start_split = job_nr * split_amount; - int end_split = start_split + split_amount; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if (job_nr == n_jobs -1) { - end_split = n_row; - } + if(sums[head] > 
lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - temp_vector[0] = start_split; - temp_vector[1] = end_split; + int temp = head; + head = next[head]; //iterate over columns - split_row_vector[job_nr] = temp_vector; + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - } + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + int len = (int) temp_candidates.size(); + (*total) += len; + *n_minmax = (len > *n_minmax)? len : *n_minmax; + real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } +} +void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* vCj, + std::vector* vCx, + int* n_minmax, + int n_jobs +) +{ + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; + std::vector> real_candidates(n_row); + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + // initialize aggregates: + std::vector sub_total(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - thread_list[job_nr] = std::thread (inner_sparse_minmax_function, start_row, - end_row, n_col, ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, - &split_minmax_ntop[job_nr]); + // execute the jobs: + std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - } + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - thread_list[job_nr].join(); + thread_list[job_nr] = std::thread ( + inner_sparse_dot_free, + start_row, end_row, + n_col, + lower_bound, + Ap, Aj, Ax, Bp, 
Bj, Bx, + real_cand_pointer, + &sub_total[job_nr], + &split_n_minmax[job_nr] + ); } - int nnz = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - for (int m = 0; m < n_row; m++) { + // gather the results (in parallel): + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector cand = real_cand_pointer[m]; + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - int can_len = (int)cand.size(); + int total = start_points.back(); + vCj->resize(total); + vCx->resize(total); - for(int can_nr=0; can_nr < can_len; can_nr++){ - Cj[nnz] = cand[can_nr].index; - Cx[nnz] = cand[can_nr].value; - nnz++; - } + Cp[0] = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - Cp[m+1] = nnz; + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + &((*vCj)[0]), + &((*vCx)[0]), + real_cand_pointer + ); } - *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } -void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inner, - int Ap_copy[], int Aj_copy[], - int Bp_copy[], int Bj_copy[], - int *minmax_ntop) +void inner_sparse_only_max_nnz_col( + int start_row, + int end_row, + int n_col_inner, + int Ap_copy[], + int Aj_copy[], + int Bp_copy[], + int Bj_copy[], + int *max_nnz_col // already initialized to 0 +) { std::vector unmarked(n_col_inner, true); @@ -396,55 +604,44 @@ void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inn } } } - *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + *max_nnz_col = (length > *max_nnz_col)? 
length : *max_nnz_col; } } -void sparse_dot_only_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop, - int n_jobs) +void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col, + int n_jobs +) { - std::vector job_load_sz(n_jobs, n_row/n_jobs); - - int rem = n_row % n_jobs; - for (int r = 0; r < rem; r++) job_load_sz[r] += 1; - std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - std::vector split_minmax_ntop(n_jobs, 0); - + std::vector split_max_nnz_col(n_jobs, 0); std::vector thread_list(n_jobs); - - int start = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); - - temp_vector[0] = start; - temp_vector[1] = start + job_load_sz[job_nr]; - start = temp_vector[1]; - - split_row_vector[job_nr] = temp_vector; - } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; int end_row = split_row_vector[job_nr][1]; - thread_list[job_nr] = std::thread (inner_sparse_only_minmax_function, - start_row, end_row, n_col, - Ap, Aj, Bp, Bj, - &split_minmax_ntop[job_nr]); + thread_list[job_nr] = std::thread ( + inner_sparse_only_max_nnz_col, + start_row, end_row, + n_col, + Ap, Aj, Bp, Bj, + &split_max_nnz_col[job_nr] + ); } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + *max_nnz_col = *std::max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); } + diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index cb43cd1c..30dc24ef 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -23,44 +23,67 @@ #ifndef UTILS_CPPCLASS_H 
#define UTILS_CPPCLASS_H -extern void sparse_dot_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs); +extern void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs +); -extern void sparse_dot_plus_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* minmax_topn, - int n_jobs); +extern void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs +); -extern void sparse_dot_only_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop, - int n_jobs); +extern void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx, + int* n_minmax, + int njobs +); + +extern void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col, + int n_jobs +); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index c4544790..88abbd6a 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -49,19 +49,21 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]) +void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); @@ -133,10 +135,12 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_source + C++ implementation of sparse_dot_topn_extd_source This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. + C = [sorted top n results > lower_bound for each row of A * B]. + The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. Input: n_row: number of rows of A matrix @@ -145,37 +149,41 @@ void sparse_dot_topn_source(int n_row, Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C + ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: Cp, Cj, Cx: CSR expression of C matrix + n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int memory_bound, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]) +void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int* n_minmax +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); std::vector candidates; - candidates.reserve(memory_bound); int nnz = 0; Cp[0] = 0; + *n_minmax = 0; for(int i = 0; i < n_row; i++){ int head = -2; @@ -219,7 +227,13 @@ void sparse_dot_source(int n_row, } int len = (int)candidates.size(); - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + *n_minmax = (len > *n_minmax)? len : *n_minmax; + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } for(int a=0; a < len; a++){ Cj[nnz] = candidates[a].index; @@ -237,6 +251,7 @@ void sparse_dot_source(int n_row, This function will return a matrix C in CSR format, where C = [all results > lower_bound sorted for each row of A * B]. + It also returns the maximum number of elements per row of C. Input: n_row: number of rows of A matrix @@ -250,24 +265,29 @@ void sparse_dot_source(int n_row, Output by reference: Cp: C array for idx_pointer of CSR expression of C matrix - Cj: numpy array for indices of CSR expression of C matrix - Cx: numpy array for data values of CSR expression of C matrix + Cj: STL vector for indices of CSR expression of C matrix + Cx: STL vector for data values of CSR expression of C matrix + n_minmax: the maximum number of elements per row of C N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_free_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx) +void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx, + int* n_minmax +) { + *n_minmax = 0; int sz = std::max(n_row, n_col); Cj->reserve(sz); Cx->reserve(sz); @@ -321,6 +341,7 @@ void sparse_dot_free_source(int n_row, } int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? len : *n_minmax; std::sort(candidates.begin(), candidates.end(), candidate_cmp); for(int a=0; a < len; a++){ @@ -358,17 +379,19 @@ void sparse_dot_free_source(int n_row, N.B. A and B must be CSR format!!! */ -void sparse_dot_nnz_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop) +void sparse_dot_nnz_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int* nnz, + int* ntop +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); @@ -418,7 +441,7 @@ void sparse_dot_nnz_source(int n_row, } /* - C++ implementation of sparse_dot_only_minmax_topn_source + C++ implementation of sparse_dot_only_max_nnz_col_source This function will return the maximum number of columns set per row over all rows of A * B @@ -431,22 +454,24 @@ void sparse_dot_nnz_source(int n_row, Bp, Bj, Bx: CSR expression of B matrix Output by reference: - minmax_ntop: the maximum number of columns set per row + max_nnz_col: the maximum number of columns set per row over all rows of A * B N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_only_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop) +void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col +) { std::vector unmarked(n_col, true); - *minmax_ntop = 0; + *max_nnz_col = 0; for(int i = 0; i < n_row; i++){ int length = 0; @@ -467,6 +492,6 @@ void sparse_dot_only_minmax_topn_source(int n_row, } } } - *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; } } diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 664378e3..723e9acc 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -28,21 +28,41 @@ struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); -extern void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); //data of C +extern void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] //data of C +); + +extern void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int* n_minmax +); -extern void sparse_dot_free_source(int n_row, +extern void sparse_dot_free_source( + int n_row, int n_col, int Ap[], int Aj[], @@ -53,14 +73,18 @@ extern void sparse_dot_free_source(int n_row, double lower_bound, int Cp[], std::vector* Cj, - std::vector* Cx); + 
std::vector* Cx, + int* n_minmax +); -extern void sparse_dot_only_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop); +extern void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col +); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 0bb45a6a..86c347ec 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -19,70 +19,97 @@ # distutils: language = c++ -import numpy as np +from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double + cimport numpy as np +import numpy as np + + +np.import_array() + cdef extern from "sparse_dot_topn_parallel.h": cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs); - - cdef void sparse_dot_plus_minmax_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int minmax_ntop[], - int n_jobs); - - cdef void sparse_dot_only_minmax_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int minmax_ntop[], - int n_jobs); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); + + cdef void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); + + cdef void 
sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -98,23 +125,23 @@ cpdef sparse_dot_topn_threaded( lower_bound, Cp, Cj, Cx, n_jobs) return -cpdef sparse_dot_plus_minmax_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] minmax_ntop, - int n_jobs - ): +cpdef sparse_dot_topn_extd_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + 
np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -125,28 +152,62 @@ cpdef sparse_dot_plus_minmax_topn_threaded( cdef int* Cp = &c_indptr[0] cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - cdef int* o_minmax_ntop = &minmax_ntop[0] + cdef int* n_minmax = &nminmax[0] - sparse_dot_plus_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) + sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) return -cpdef sparse_dot_only_minmax_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_ntop, - int n_jobs - ): +cpdef sparse_dot_free_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, 
lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] + +cpdef sparse_dot_only_max_nnz_col_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] cdef int* Bp = &b_indptr[0] cdef int* Bj = &b_indices[0] - cdef int* o_minmax_ntop = &minmax_ntop[0] + cdef int* o_max_nnz_col = &max_nnz_col[0] - sparse_dot_only_minmax_topn_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_ntop, n_jobs) + sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) return diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py new file mode 100644 index 00000000..fb0d67ab --- /dev/null +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -0,0 +1,346 @@ +# -*- coding: utf-8 -*- + +from sparse_dot_topn import awesome_cossim_topn +from scipy.sparse.csr import csr_matrix +from scipy.sparse import coo_matrix +from scipy.sparse import rand +import numpy as np +import pandas as pd +import multiprocessing +import pytest + +PRUNE_THRESHOLD = 0.1 +NUM_CANDIDATES = 3 +MEM_MANAGER_IS_C = True +USE_THREADS = True +MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 + + +def get_n_top_sparse(mat, n_top=10): + """ + Get list of (index, value) of the n largest elements in a 1-dimensional sparse matrix + + :param mat: input sparse matrix + :param n_top: number of largest elements, default is 10. 
+ :return: sorted list of largest elements + """ + length = mat.getnnz() + if length == 0: + return None + if length <= n_top: + result = list(zip(mat.indices, mat.data)) + else: + arg_idx = np.argpartition(mat.data, -n_top)[-n_top:] + result = list(zip(mat.indices[arg_idx], mat.data[arg_idx])) + return sorted(result, key=lambda x: -x[1]) + + +def helper_awesome_cossim_topn_dense( + a_dense, + b_dense, + mem_manager_is_C=False, + use_threads=False, + n_jobs=1 + ): + dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product + sparse_result = csr_matrix(dense_result) + sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) + for row in sparse_result] # get ntop using the old method + + pruned_dense_result = dense_result.copy() + pruned_dense_result[pruned_dense_result < PRUNE_THRESHOLD] = 0 # prune low similarity + pruned_sparse_result = csr_matrix(pruned_dense_result) + pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] + + a_csr = csr_matrix(a_dense) + b_csr_t = csr_matrix(b_dense).T + + awesome_result = awesome_cossim_topn( + a_csr, b_csr_t, len(b_dense), + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed + + pruned_awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + len(b_dense), + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + pruned_awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in pruned_awesome_result_top3] + + # no 
candidate selection, no pruning + assert awesome_result.nnz == sparse_result.nnz + # no candidate selection, below PRUNE_THRESHOLD similarity pruned + assert pruned_awesome_result.nnz == pruned_sparse_result.nnz + + all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) + all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) + + # top NUM_CANDIDATES candidates selected, no pruning + if not all_none1: + np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) + else: + assert len(awesome_result_top3) == len(sparse_result_top3) + # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned + if not all_none2: + np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) + else: + assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) + + +def helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + flag=True, + mem_manager_is_C=False, + use_threads=False, + n_jobs=1 + ): + # Note: helper function using awesome_cossim_topn + sparse_result = a_sparse.dot(b_sparse.T) # dot product + sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) + for row in sparse_result] # get ntop using the old method + + pruned_sparse_result = sparse_result.copy() + pruned_sparse_result[pruned_sparse_result < PRUNE_THRESHOLD] = 0 # prune low similarity + pruned_sparse_result.eliminate_zeros() + pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] + + a_csr = csr_matrix(a_sparse) + b_csr_t = csr_matrix(b_sparse).T + + awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + b_sparse.shape[0], + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = [list(zip(row.indices, 
row.data)) if len( + row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed + + pruned_awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + b_sparse.shape[0], + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + pruned_awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in pruned_awesome_result_top3] + + # no candidate selection, no pruning + assert awesome_result.nnz == sparse_result.nnz + # no candidate selection, below PRUNE_THRESHOLD similarity pruned + assert pruned_awesome_result.nnz == pruned_sparse_result.nnz + + if flag: + all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) + all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) + + # top NUM_CANDIDATES candidates selected, no pruning + if not all_none1: + np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) + else: + assert len(awesome_result_top3) == len(sparse_result_top3) + # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned + if not all_none2: + np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) + else: + assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) + else: + assert awesome_result_top3 == sparse_result_top3 + assert pruned_awesome_result_top3 == pruned_sparse_result_top3 + + +def test_awesome_cossim_topn_manually(): + # a simple case + a_dense = [[0.2, 0.1, 0.0, 0.9, 0.3], + [0.7, 0.0, 0.0, 0.2, 0.2], + [0.0, 0.0, 0.0, 0.2, 0.1], + [0.5, 0.4, 0.5, 0.0, 0.0]] + + b_dense = [[0.4, 0.2, 0.3, 0.2, 0.7], + [0.9, 0.4, 0.5, 0.1, 0.4], + [0.3, 0.8, 0.0, 0.2, 0.5], + [0.3, 0.0, 0.1, 0.1, 0.6], + [0.6, 0.1, 0.2, 
0.8, 0.1], + [0.9, 0.1, 0.6, 0.4, 0.3]] + helper_awesome_cossim_topn_dense(a_dense, b_dense) + helper_awesome_cossim_topn_dense(a_dense, b_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_dense( + a_dense, + b_dense, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + # boundary checking, there is no matching at all in this case + c_dense = [[0.2, 0.1, 0.3, 0, 0], + [0.7, 0.2, 0.7, 0, 0], + [0.3, 0.9, 0.6, 0, 0], + [0.5, 0.4, 0.5, 0, 0]] + d_dense = [[0, 0, 0, 0.6, 0.9], + [0, 0, 0, 0.1, 0.1], + [0, 0, 0, 0.2, 0.6], + [0, 0, 0, 0.8, 0.4], + [0, 0, 0, 0.1, 0.3], + [0, 0, 0, 0.7, 0.5]] + helper_awesome_cossim_topn_dense(c_dense, d_dense) + helper_awesome_cossim_topn_dense(c_dense, d_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_dense( + c_dense, + d_dense, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_one_zeros(): + # test with one row matrix with all zeros + # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top + nr_vocab = 1000 + density = 0.1 + for _ in range(3): + a_sparse = csr_matrix(np.zeros((1, nr_vocab))) + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + 
helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_all_zeros(): + # test with all zeros matrix + # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top + nr_vocab = 1000 + density = 0.1 + for _ in range(3): + a_sparse = csr_matrix(np.zeros((2, nr_vocab))) + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_small_matrix(): + # test with small matrix + nr_vocab = 1000 + density = 0.1 + for _ in range(10): + a_sparse = rand(300, nr_vocab, density=density, format='csr') + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) + 
helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + False, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_large_matrix(): + # MB: I reduced the size of the matrix so the test also runs in small memory. + # test with large matrix + nr_vocab = 2 << 24 + density = 1e-6 + n_samples = 10000 + nnz = int(n_samples * nr_vocab * density) + + rng1 = np.random.RandomState(42) + rng2 = np.random.RandomState(43) + + for _ in range(1): + # scipy.sparse.rand has very high memory usage + # see for details: https://github.com/scipy/scipy/issues/9699 + # a_sparse = rand(500, nr_vocab, density=density, format='csr') + # b_sparse = rand(80000, nr_vocab, density=density, format='csr') + + # switching to alternative random method below, which is also a lot faster + row = rng1.randint(500, size=nnz) + cols = rng2.randint(nr_vocab, size=nnz) + data = rng1.rand(nnz) + + a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a_sparse = a_sparse.tocsr() + + row = rng1.randint(n_samples, size=nnz) + cols = rng2.randint(nr_vocab, size=nnz) + data = rng1.rand(nnz) + + b_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + b_sparse = b_sparse.tocsr() + + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + False, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) diff --git a/string_grouper/string_grouper.py 
b/string_grouper/string_grouper.py index 69ecd912..1ea3b1a9 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,7 +6,7 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only +from sparse_dot_topn import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 @@ -219,16 +219,16 @@ def __init__(self, master: pd.Series, self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) - self._max_n_matches = DEFAULT_MAX_N_MATCHES if self._config.max_n_matches is None \ + self._max_n_matches = len(self._master) if self._config.max_n_matches is None \ else self._config.max_n_matches self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) - # After the StringGrouper is built, _matches_list will contain the indices and similarities of two matches - # and _true_max_n_matches will contain the true maximum number of matches over all strings in master if - # self._config.min_similarity <= 0 + # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() + # _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: @@ -248,7 +248,7 @@ def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = 
self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity - matches = self._build_matches(master_matrix, duplicate_matrix) + matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) @@ -435,21 +435,12 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { + 'ntop_is_flexible': self._config.max_n_matches is None, + 'return_best_topn': True, 'use_threads': True, 'n_jobs': self._config.number_of_processes } - # compute the true maximum number of matches over all strings in master: - self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( - tf_idf_matrix_1, - tf_idf_matrix_2, - **optional_kwargs - ) - - if self._config.min_similarity <= 0 and self._config.max_n_matches is None: - # if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value - self._max_n_matches = self._true_max_n_matches - return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, self._max_n_matches, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index c928bfa3..d5c1dd0b 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -334,7 +334,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[0].toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 
5a12efbf9c7daa0b9df781d1b9964df7839a7a9d Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 24 Apr 2021 00:31:55 +0200 Subject: [PATCH 08/29] defragmented temporary memory allocations in sparse_dot_topn routines --- setup.py | 3 + sparse_dot_topn/array_wrappers.pxd | 16 +- sparse_dot_topn/array_wrappers.pyx | 116 ++-- sparse_dot_topn/awesome_cossim_topn.py | 449 ++++++------- sparse_dot_topn/sparse_dot_topn.pyx | 498 +++++++------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 406 ++++++------ sparse_dot_topn/sparse_dot_topn_source.cpp | 658 ++++++++++--------- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 346 +++++----- string_grouper/string_grouper.py | 4 +- 9 files changed, 1257 insertions(+), 1239 deletions(-) diff --git a/setup.py b/setup.py index 5cb9c5e0..577ed0d9 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_source.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') original_ext = Extension('sparse_dot_topn.sparse_dot_topn', @@ -43,6 +44,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_source.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', @@ -52,6 +54,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_parallel.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') setup( diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd index f3342ef5..d77e41b3 100644 --- a/sparse_dot_topn/array_wrappers.pxd +++ b/sparse_dot_topn/array_wrappers.pxd @@ -2,17 +2,17 @@ from libcpp.vector cimport vector # define a Cython array wrapper class to hold a C++ vector of ints, 
adhering to numpy's buffer protocol: cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] # define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx index d0dd4f3e..ee458629 100644 --- a/sparse_dot_topn/array_wrappers.pyx +++ b/sparse_dot_topn/array_wrappers.pyx @@ -3,71 +3,71 @@ from libcpp.vector cimport vector # define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
- def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + 
self.view_count -= 1 # define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + 
buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 6e459b29..efce38bd 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -4,231 +4,238 @@ from scipy.sparse import isspmatrix_csr if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - ntop_is_flexible=False, - mem_manager_is_C=False, - return_best_topn=False - ): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True it will also return best_topn (the - true maximum number of elements > lower_bound per row of A * B). - - Input: - A and B: two CSR matrices - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - ntop_is_flexible: if True, memory management will be handed over to C/C++ if - python's attempt at allocating memory fails. - mem_manager_is_C: (this is mainly for testing purposes) if True, will force - memory management to be handed over to C/C++. 
Should be - used only when ntop >= number of columns of B or - ntop_is_flexible=True. Defaults to False. - return_best_topn: if True, will return best_topn together with C as a tuple: - (C, best_topn) - - Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, 0 - else: - return output - - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes - except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) - if ntop_is_flexible or ntop >= N: - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. 
- # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) - if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr - ) - - else: - - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr, n_jobs - ) - - else: - if mem_manager_is_C: - raise Exception('When mem_manager_is_C=True, set ntop >= N, or set ntop_is_flexible=True') - else: - raise Exception('Not enough memory! Data array is too large. Try reducing the value of ntop.') - - else: - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) - - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - - best_topn = best_topn_arr[0] - - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + ntop_is_flexible=False, + mem_manager_is_C=False, + return_best_topn=False + ): + """ + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True then best_topn + (the true maximum number of elements > lower_bound per row of A * B) + will also be returned in a tuple together with C as (C, best_topn). + + Input: + A and B: two CSR matrices + ntop: top n results + lower_bound: a threshold that the element of A*B must be greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + ntop_is_flexible: (default: False) if True, memory management will be handed + over to C/C++ whenever python's attempt at allocating + memory fails. + mem_manager_is_C: (default: False) this is mainly for testing purposes. if + True, will force memory management to be handed over to + C/C++. Should be used only when ntop >= number of columns + of B or ntop_is_flexible=True. + return_best_topn: (default: False) if True, will return best_topn together + with C as a tuple: (C, best_topn) + + Output: + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. + + N.B. 
if A and B are not in CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output + + # filled matrices from here on + indptr = np.empty(M + 1, dtype=idx_dtype) + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + if ntop_is_flexible or ntop >= N: + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr + ) + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr, n_jobs + ) + else: + + if mem_manager_is_C: + raise Exception( + 'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True' + ) + else: + raise Exception( + 'Not enough memory! Data array is too large. Try reducing the value of ntop.' + 'or set ntop_is_flexible=True' + ) + else: + # no exception was raised; then use old function (as it is expected to be the fastest) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + best_topn = best_topn_arr[0] + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) - - return minmax_topn[0] + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return 0 directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_max_nnz_col( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_max_nnz_col_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 9c35d3e9..b4e8463d 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,260 +30,260 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] 
b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + 
a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number of elements per row of C nminmax is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - nminmax: The maximum number of elements per row of C - - N.B. 
A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number of elements per row of C nminmax is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + nminmax: The maximum number of elements per row of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [all results > lower_bound for each row of A * B] - This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the - storage of these results as needed during the computation; then hands over to numpy - a pointer to the memory location where the data resides - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): + """ + Cython glue function to call sparse_dot_free C++ implementation + This function will return a matrix C in CSR format, where + C = [all results > lower_bound for each row of A * B] + This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the + storage of these results as needed during the computation; then hands over to numpy + a pointer to the memory location where the data resides + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index d941248e..fa37746f 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -30,81 +30,85 @@ #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" +struct job_range_type {int begin; int end;}; void distribute_load( int load_sz, int n_jobs, - std::vector> &ranges + std::vector &ranges ) { - // share the load among jobs: - int equal_job_load_sz = load_sz/n_jobs; + // share the load among jobs: + int equal_job_load_sz = load_sz/n_jobs; int rem = load_sz % n_jobs; ranges.resize(n_jobs); - int start = 0; + int start = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); - temp_vector[0] = start; - temp_vector[1] = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); - start = temp_vector[1]; - - ranges[job_nr] = temp_vector; + ranges[job_nr].begin = start; + ranges[job_nr].end = start + equal_job_load_sz + ((job_nr < rem)? 
1 : 0); + start = ranges[job_nr].end; } } void inner_gather_function( - int start_row, - int end_row, + job_range_type job_range, int Cp[], int Cp_start, int vCj_start[], double vCx_start[], - std::vector real_candidates[] + std::vector* real_candidates, + std::vector* row_sizes ) { - int Cp_i = Cp_start; + candidate* c = real_candidates->data(); int* vCj_cursor = &vCj_start[Cp_start]; double* vCx_cursor = &vCx_start[Cp_start]; - candidate c; - for (int i = start_row; i < end_row; i++){ - Cp_i += (int) real_candidates[i].size(); - Cp[i + 1] = Cp_i; - for (unsigned int j = 0; j < real_candidates[i].size(); j++){ - c = real_candidates[i][j]; - *(vCj_cursor++) = c.index; - *(vCx_cursor++) = c.value; + + int Cp_i = Cp_start; + int* row_sizes_ptr = row_sizes->data(); + + for (int i = job_range.begin; i < job_range.end; i++){ + for (int j = 0; j < (*row_sizes_ptr); j++){ + *(vCj_cursor++) = c->index; + *(vCx_cursor++) = (c++)->value; } - real_candidates[i].clear(); + Cp_i += *(row_sizes_ptr++); + Cp[i + 1] = Cp_i; } + real_candidates->clear(); } void inner_sparse_dot_topn( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, - int ntop_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], - std::vector real_candidates[], + double Bx_copy[], + std::vector* real_candidates, + std::vector* row_sizes, int* total ) { std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); - for(int i = start_row; i < end_row; i++){ + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); + + for (int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -134,7 +138,7 @@ void inner_sparse_dot_topn( 
candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -144,24 +148,31 @@ void inner_sparse_dot_topn( sums[temp] = 0; //clear arrays } - int len = (int)temp_candidates.size(); + int len = (int) (real_candidates->size() - sz); + + candidate* candidate_arr_begin = real_candidates->data() + sz; if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); len = ntop_inner; } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); } + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; - temp_candidates.resize(len); - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_topn_parallel( @@ -181,13 +192,11 @@ void sparse_dot_topn_parallel( int n_jobs ) { - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; - + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregate: std::vector sub_total(n_jobs, 0); @@ -195,53 +204,48 @@ void sparse_dot_topn_parallel( std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn, - start_row, end_row, + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn, + 
job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results: - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + // gather the results: + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], Cj, Cx, - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_dot_topn_extd( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, int ntop_inner, double lower_bound_inner, @@ -251,25 +255,25 @@ void inner_sparse_dot_topn_extd( int Bp_copy[], int Bj_copy[], double Bx_copy[], - std::vector real_candidates[], + std::vector* real_candidates, + std::vector* row_sizes, int* total, int* n_minmax ) { - std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); - int iterations_count = 0; + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); - for(int i = start_row; i < end_row; i++){ - - 
iterations_count += 1; + for(int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -299,7 +303,7 @@ void inner_sparse_dot_topn_extd( candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -309,25 +313,32 @@ void inner_sparse_dot_topn_extd( sums[temp] = 0; //clear arrays } - int len = (int)temp_candidates.size(); + int len = (int) (real_candidates->size() - sz); *n_minmax = (len > *n_minmax)? len : *n_minmax; + + candidate* candidate_arr_begin = real_candidates->data() + sz; if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); len = ntop_inner; } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); } + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; - temp_candidates.resize(len); - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_topn_extd_parallel( @@ -348,94 +359,92 @@ void sparse_dot_topn_extd_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); + std::vector 
split_n_minmax(n_jobs, 0); - std::vector thread_list(n_jobs); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn_extd, - start_row, end_row, + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn_extd, + job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results: - *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); + // gather the results: + *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], Cj, Cx, - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_dot_free( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, - double lower_bound_inner, + 
double lower_bound_inner, int Ap_copy[], - int Aj_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], - std::vector real_candidates[], + double Bx_copy[], + std::vector* real_candidates, + std::vector* row_sizes, int* total, int* n_minmax ) { - std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); + + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); - for(int i = start_row; i < end_row; i++){ + for(int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -465,7 +474,7 @@ void inner_sparse_dot_free( candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -475,16 +484,21 @@ void inner_sparse_dot_free( sums[temp] = 0; //clear arrays } + int len = (int) (real_candidates->size() - sz); - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + candidate* candidate_arr_begin = real_candidates->data() + sz; + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); - int len = (int) temp_candidates.size(); + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_free_parallel( @@ -504,75 +518,71 @@ void sparse_dot_free_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - // execute the jobs: + // execute the jobs: std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread ( - inner_sparse_dot_free, - start_row, end_row, + thread_list[job_nr] = std::thread ( + inner_sparse_dot_free, + job_ranges[job_nr], n_col, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + Ap, Aj, Ax, Bp, Bj, Bx, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results (in parallel): - *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); + // gather the results (in parallel): + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), 
start_points.begin() + 1); - int total = start_points.back(); - vCj->resize(total); - vCx->resize(total); + int total = start_points.back(); + vCj->resize(total); + vCj->shrink_to_fit(); + vCx->resize(total); + vCx->shrink_to_fit(); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], &((*vCj)[0]), &((*vCx)[0]), - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_only_max_nnz_col( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, int Ap_copy[], int Aj_copy[], @@ -583,7 +593,7 @@ void inner_sparse_only_max_nnz_col( { std::vector unmarked(n_col_inner, true); - for(int i = start_row; i < end_row; i++){ + for(int i = job_range.begin; i < job_range.end; i++){ int length = 0; @@ -619,29 +629,25 @@ void sparse_dot_only_max_nnz_col_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector split_max_nnz_col(n_jobs, 0); - std::vector thread_list(n_jobs); + std::vector split_max_nnz_col(n_jobs, 0); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread ( - inner_sparse_only_max_nnz_col, - start_row, end_row, + thread_list[job_nr] = std::thread ( + inner_sparse_only_max_nnz_col, + job_ranges[job_nr], n_col, Ap, Aj, Bp, Bj, 
&split_max_nnz_col[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - *max_nnz_col = *std::max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); + *max_nnz_col = *max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); } - diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 88abbd6a..f0400f0e 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -29,25 +29,25 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } /* - C++ implementation of sparse_dot_topn + C++ implementation of sparse_dot_topn - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix + 
Output by reference: + Cp, Cj, Cx: CSR expression of C matrix - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_topn_source( int n_row, @@ -65,98 +65,98 @@ void sparse_dot_topn_source( double Cx[] ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int 
jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } } /* - C++ implementation of sparse_dot_topn_extd_source + C++ implementation of sparse_dot_topn_extd_source - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. 
- Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_topn_extd_source( int n_row, @@ -175,101 +175,101 @@ void sparse_dot_topn_extd_source( int* n_minmax ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - *n_minmax = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - 
candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + *n_minmax = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } } /* - C++ implementation of sparse_dot_free_source + C++ implementation of sparse_dot_free_source - This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. - It also returns the maximum number of elements per row of C. + This function will return a matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B]. + It also returns the maximum number of elements per row of C. - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C - lower_bound: a threshold that the element of A*B must greater than + memory_bound: the maximum number of elements per row of C + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp: C array for idx_pointer of CSR expression of C matrix - Cj: STL vector for indices of CSR expression of C matrix - Cx: STL vector for data values of CSR expression of C matrix - n_minmax: the maximum number of elements per row of C + Output by reference: + Cp: C array for idx_pointer of CSR expression of C matrix + Cj: STL vector for indices of CSR expression of C matrix + Cx: STL vector for data values of CSR expression of C matrix + n_minmax: the maximum number of elements per row of C - N.B. 
A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_free_source( int n_row, @@ -292,92 +292,94 @@ void sparse_dot_free_source( Cj->reserve(sz); Cx->reserve(sz); - std::vector next(n_col,-1); - std::vector sums(n_col, 0); + std::vector next(n_col,-1); + std::vector sums(n_col, 0); - std::vector candidates; + std::vector candidates; - Cp[0] = 0; + Cp[0] = 0; - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } + if(sums[head] > lower_bound){ //append the 
nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } - int temp = head; - head = next[head]; //iterate over columns + int temp = head; + head = next[head]; //iterate over columns - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? len : *n_minmax; + std::sort(candidates.begin(), candidates.end(), candidate_cmp); - for(int a=0; a < len; a++){ - Cj->push_back(candidates[a].index); - Cx->push_back(candidates[a].value); - } - candidates.clear(); + for(int a=0; a < len; a++){ + Cj->push_back(candidates[a].index); + Cx->push_back(candidates[a].value); + } + candidates.clear(); - Cp[i+1] = Cj->size(); - } + Cp[i+1] = (int) (Cj->size()); + } + Cj->shrink_to_fit(); + Cx->shrink_to_fit(); } /* - C++ implementation of sparse_dot_nnz_source + C++ implementation of sparse_dot_nnz_source - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. + This function is designed primarily to help with memory management for + very large sparse matrices. 
- Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - lower_bound: a threshold that the element of A*B must greater than + lower_bound: a threshold that the element of A*B must greater than - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_nnz_source( int n_row, @@ -393,71 +395,71 @@ void sparse_dot_nnz_source( int* ntop ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - *nnz = 0; - *ntop = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - int nnz_k = 0; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - *ntop = (nnz_k > *ntop)? 
nnz_k : *ntop; - *nnz += nnz_k; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + *nnz = 0; + *ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + int nnz_k = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + *ntop = (nnz_k > *ntop)? nnz_k : *ntop; + *nnz += nnz_k; + } } /* - C++ implementation of sparse_dot_only_max_nnz_col_source + C++ implementation of sparse_dot_only_max_nnz_col_source - This function will return the maximum number of columns set - per row over all rows of A * B + This function will return the maximum number of columns set + per row over all rows of A * B - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - Output by reference: - max_nnz_col: the maximum number of columns set per row - over all rows of A * B + Output by reference: + max_nnz_col: the maximum number of columns set per row + over all rows of A * B - N.B. 
A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_only_max_nnz_col_source( int n_row, @@ -469,29 +471,29 @@ void sparse_dot_only_max_nnz_col_source( int *max_nnz_col ) { - std::vector unmarked(n_col, true); - - *max_nnz_col = 0; - - for(int i = 0; i < n_row; i++){ - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; // kth column of B in row j - - if(unmarked[k]){ // if this k is not already marked then ... - unmarked[k] = false; // keep a record of column k - length++; - } - } - } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; - } + std::vector unmarked(n_col, true); + + *max_nnz_col = 0; + + for(int i = 0; i < n_row; i++){ + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; // kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + } } diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 86c347ec..8b003f08 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,183 +31,183 @@ np.import_array() cdef extern from "sparse_dot_topn_parallel.h": - cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); - - cdef void sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, - int n_jobs - ); + cdef void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); + + cdef void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, 
+ int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_jobs) + return cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - 
np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) + return cpdef sparse_dot_free_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double 
lower_bound, - np.ndarray[int, ndim=1] c_indptr, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs - ): - - cdef int* Ap 
= &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_max_nnz_col = &max_nnz_col[0] - - sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_max_nnz_col = &max_nnz_col[0] + + sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 1ea3b1a9..33bfb0e6 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -135,11 +135,11 @@ def match_strings(master: pd.Series, class StringGrouperConfig(NamedTuple): - """ + r""" Class with configuration variables. :param ngram_size: int. The amount of characters in each n-gram. Default is 3. - :param regex: str. The regex string used to cleanup the input string. Default is [,-./]|\s. + :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. Defaults to 0.8. 
From 30712de5d8c167d36bccf5580ee8d9f6ded7ee94 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 24 Apr 2021 20:27:12 +0200 Subject: [PATCH 09/29] made ntop always flexible (i.e., not only when ntop >= B.shape[1]) --- sparse_dot_topn/awesome_cossim_topn.py | 441 ++++++++------- sparse_dot_topn/sparse_dot_topn.pyx | 515 +++++++++--------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 27 +- sparse_dot_topn/sparse_dot_topn_parallel.h | 1 + sparse_dot_topn/sparse_dot_topn_source.cpp | 20 +- sparse_dot_topn/sparse_dot_topn_source.h | 1 + sparse_dot_topn/sparse_dot_topn_threaded.pyx | 4 +- .../test/test_awesome_cossim_topn.py | 44 +- string_grouper/string_grouper.py | 9 +- 9 files changed, 559 insertions(+), 503 deletions(-) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index efce38bd..48caaa57 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -4,238 +4,223 @@ from scipy.sparse import isspmatrix_csr if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - ntop_is_flexible=False, - mem_manager_is_C=False, - return_best_topn=False - ): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True then best_topn - (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_topn). 
- - Input: - A and B: two CSR matrices - ntop: top n results - lower_bound: a threshold that the element of A*B must be greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - ntop_is_flexible: (default: False) if True, memory management will be handed - over to C/C++ whenever python's attempt at allocating - memory fails. - mem_manager_is_C: (default: False) this is mainly for testing purposes. if - True, will force memory management to be handed over to - C/C++. Should be used only when ntop >= number of columns - of B or ntop_is_flexible=True. - return_best_topn: (default: False) if True, will return best_topn together - with C as a tuple: (C, best_topn) - - Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, 0 - else: - return output - - # filled matrices from here on - indptr = np.empty(M + 1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes - - except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) - if ntop_is_flexible or ntop >= N: - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. 
- # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) - if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr - ) - else: - - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr, n_jobs - ) - else: - - if mem_manager_is_C: - raise Exception( - 'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True' - ) - else: - raise Exception( - 'Not enough memory! Data array is too large. Try reducing the value of ntop.' - 'or set ntop_is_flexible=True' - ) - else: - # no exception was raised; then use old function (as it is expected to be the fastest) - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - best_topn = best_topn_arr[0] - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + mem_manager_is_C=False, + return_best_topn=False + ): + """ + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True then best_topn + (the true maximum number of elements > lower_bound per row of A * B) + will also be returned in a tuple together with C as (C, best_topn). + + Input: + A and B: two CSR matrices + ntop: top n results + lower_bound: a threshold that the element of A*B must be greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + mem_manager_is_C: (default: False) this is mainly for testing purposes. if + True, will force memory management to be handed over to + C/C++. + return_best_topn: (default: False) if True, will return best_topn together + with C as a tuple: (C, best_topn) + + Output: + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. + + N.B. if A and B are not in CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. 
A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, lower_bound, + indptr + ) + + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, lower_bound, + indptr, n_jobs + ) + + else: + # no exception was raised; then use old function (as it is expected to be the fastest) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + + best_topn = best_topn_arr[0] + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) - - return minmax_topn[0] + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return 0 directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_max_nnz_col( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_max_nnz_col_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index b4e8463d..1d9e751a 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,260 +30,277 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, 
ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + 
a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number of elements per row of C nminmax is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - nminmax: The maximum number of elements per row of C - - N.B. 
A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn_extd C++ + implementation. This function will return a matrix C in CSR + format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types + of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [all results > lower_bound for each row of A * B] - This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the - storage of these results as needed during the computation; then hands over to numpy - a pointer to the memory location where the data resides - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): + """ + Cython glue function to call sparse_dot_free C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr: index-pointer of the CSR expression of matrix C + + Returned Output: + c_indices, c_data: indices and data of the CSR expression + of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! 
+ The type of input numpy array must be aligned with types + of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index fa37746f..19efe7d2 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -419,6 +419,7 @@ void sparse_dot_topn_extd_parallel( void inner_sparse_dot_free( job_range_type job_range, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], int Aj_copy[], @@ -485,18 +486,29 @@ void inner_sparse_dot_free( } int len = (int) (real_candidates->size() - sz); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; candidate* candidate_arr_begin = real_candidates->data() + sz; - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); + if (len > ntop_inner){ + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); + len = ntop_inner; + } + else { + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); + } real_candidates->resize(sz + (size_t) len); *(row_sizes_ptr++) = len; (*total) += len; - *n_minmax = (len > *n_minmax)? len : *n_minmax; } real_candidates->shrink_to_fit(); } @@ -510,6 +522,7 @@ void sparse_dot_free_parallel( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* vCj, @@ -536,7 +549,7 @@ void sparse_dot_free_parallel( inner_sparse_dot_free, job_ranges[job_nr], n_col, - lower_bound, + ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], &row_sizes[job_nr], diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 30dc24ef..716ca04e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -67,6 +67,7 @@ extern void sparse_dot_free_parallel( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index f0400f0e..17b8b121 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -250,8 +250,9 @@ void sparse_dot_topn_extd_source( C++ implementation of sparse_dot_free_source This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. - It also returns the maximum number of elements per row of C. + C = [sorted top n results > lower_bound for each row of A * B]. 
+ The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. Input: n_row: number of rows of A matrix @@ -260,7 +261,7 @@ void sparse_dot_topn_extd_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C + ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: @@ -280,6 +281,7 @@ void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, @@ -342,7 +344,13 @@ void sparse_dot_free_source( int len = (int)candidates.size(); *n_minmax = (len > *n_minmax)? len : *n_minmax; - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } for(int a=0; a < len; a++){ Cj->push_back(candidates[a].index); @@ -350,10 +358,8 @@ void sparse_dot_free_source( } candidates.clear(); - Cp[i+1] = (int) (Cj->size()); + Cp[i+1] = Cj->size(); } - Cj->shrink_to_fit(); - Cx->shrink_to_fit(); } /* diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 723e9acc..9580d1cf 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -70,6 +70,7 @@ extern void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 8b003f08..2f858444 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -75,6 +75,7 @@ cdef extern from "sparse_dot_topn_parallel.h": int Bp[], int Bj[], double Bx[], + int ntop, double 
lower_bound, int Cp[], vector[int]* Cj, @@ -167,6 +168,7 @@ cpdef sparse_dot_free_threaded( np.ndarray[int, ndim=1] b_indptr, np.ndarray[int, ndim=1] b_indices, np.ndarray[double, ndim=1] b_data, + int ntop, double lower_bound, np.ndarray[int, ndim=1] c_indptr, int n_jobs @@ -185,7 +187,7 @@ cpdef sparse_dot_free_threaded( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index fb0d67ab..ba7dfbfc 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -62,8 +62,15 @@ def helper_awesome_cossim_topn_dense( use_threads=use_threads, n_jobs=n_jobs ) - awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed @@ -76,8 +83,15 @@ def helper_awesome_cossim_topn_dense( use_threads=use_threads, n_jobs=n_jobs ) - pruned_awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) pruned_awesome_result_top3 
= [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in pruned_awesome_result_top3] @@ -131,8 +145,15 @@ def helper_awesome_cossim_topn_sparse( use_threads=use_threads, n_jobs=n_jobs ) - awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed @@ -145,8 +166,15 @@ def helper_awesome_cossim_topn_sparse( use_threads=use_threads, n_jobs=n_jobs ) - pruned_awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in pruned_awesome_result_top3] diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 33bfb0e6..d3eb07c6 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -218,9 +218,13 @@ def __init__(self, master: pd.Series, self._duplicates: pd.Series = duplicates if duplicates is not None else None self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None + self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) - self._max_n_matches = len(self._master) if self._config.max_n_matches is None \ - else self._config.max_n_matches + if self._config.max_n_matches is None: + self._max_n_matches = len(self._master) if 
self._duplicates is None else len(self._duplicates) + else: + self._max_n_matches = self._config.max_n_matches + self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not @@ -435,7 +439,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { - 'ntop_is_flexible': self._config.max_n_matches is None, 'return_best_topn': True, 'use_threads': True, 'n_jobs': self._config.number_of_processes From 4b86ab1957d1004ec486184b028fa1edfd55fba3 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 26 Apr 2021 14:24:37 +0200 Subject: [PATCH 10/29] removed code-redundancies in sparse_dot_topn --- sparse_dot_topn/sparse_dot_topn.pyx | 214 ++++++++--------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 154 +++--------- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 232 ++++++++++--------- 3 files changed, 254 insertions(+), 346 deletions(-) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 1d9e751a..580c0f2f 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -31,80 +31,80 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax 
- ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data +): """ Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where @@ -137,25 +137,27 @@ cpdef sparse_dot_topn( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, 
Bx, ntop, lower_bound, Cp, Cj, Cx) + sparse_dot_topn_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx + ) return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, +): """ Cython glue function to call sparse_dot_topn_extd C++ implementation. 
This function will return a matrix C in CSR @@ -197,22 +199,24 @@ cpdef sparse_dot_topn_extd( cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + ) return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr +): """ Cython glue function to call sparse_dot_free C++ implementation This function will return a matrix C in CSR format, where @@ -259,7 +263,9 @@ cpdef sparse_dot_free( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax) + sparse_dot_free_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax + ) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) @@ -268,14 +274,14 @@ cpdef sparse_dot_free( cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] 
a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn +): """ Cython glue function to call sparse_dot_only_minmax_topn C++ implementation This function will return the maximum number of columns set diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 19efe7d2..1b06e927 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -21,15 +21,14 @@ // April 14, 2021 #include -#include #include #include #include -#include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" + struct job_range_type {int begin; int end;}; void distribute_load( @@ -59,7 +58,7 @@ void inner_gather_function( int vCj_start[], double vCx_start[], std::vector* real_candidates, - std::vector* row_sizes + std::vector* row_nnz ) { candidate* c = real_candidates->data(); @@ -67,17 +66,16 @@ void inner_gather_function( double* vCx_cursor = &vCx_start[Cp_start]; int Cp_i = Cp_start; - int* row_sizes_ptr = row_sizes->data(); + int* row_nnz_ptr = row_nnz->data(); for (int i = job_range.begin; i < job_range.end; i++){ - for (int j = 0; j < (*row_sizes_ptr); j++){ + for (int j = 0; j < (*row_nnz_ptr); j++){ *(vCj_cursor++) = c->index; *(vCx_cursor++) = (c++)->value; } - Cp_i += *(row_sizes_ptr++); + Cp_i += *(row_nnz_ptr++); Cp[i + 1] = Cp_i; } - real_candidates->clear(); } void inner_sparse_dot_topn( @@ -92,7 +90,7 @@ void inner_sparse_dot_topn( int Bj_copy[], double Bx_copy[], std::vector* real_candidates, - std::vector* row_sizes, + std::vector* row_nnz, int* total ) { @@ -101,8 +99,8 @@ void inner_sparse_dot_topn( real_candidates->reserve(job_range.end - job_range.begin); - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); + row_nnz->resize(job_range.end - job_range.begin); + int* row_nnz_ptr = row_nnz->data(); for (int i = job_range.begin; i < 
job_range.end; i++){ @@ -169,7 +167,7 @@ void inner_sparse_dot_topn( } real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; + *(row_nnz_ptr++) = len; (*total) += len; } real_candidates->shrink_to_fit(); @@ -195,8 +193,8 @@ void sparse_dot_topn_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> row_nnz(n_jobs); // initialize aggregate: std::vector sub_total(n_jobs, 0); @@ -211,7 +209,7 @@ void sparse_dot_topn_parallel( lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr] ); } @@ -235,13 +233,12 @@ void sparse_dot_topn_parallel( Cj, Cx, &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - } void inner_sparse_dot_topn_extd( @@ -256,7 +253,7 @@ void inner_sparse_dot_topn_extd( int Bj_copy[], double Bx_copy[], std::vector* real_candidates, - std::vector* row_sizes, + std::vector* row_nnz, int* total, int* n_minmax ) @@ -266,8 +263,8 @@ void inner_sparse_dot_topn_extd( real_candidates->reserve(job_range.end - job_range.begin); - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); + row_nnz->resize(job_range.end - job_range.begin); + int* row_nnz_ptr = row_nnz->data(); for(int i = job_range.begin; i < job_range.end; i++){ @@ -335,7 +332,7 @@ void inner_sparse_dot_topn_extd( } real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; + *(row_nnz_ptr++) = len; (*total) += len; } real_candidates->shrink_to_fit(); @@ -362,8 +359,8 @@ void sparse_dot_topn_extd_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> 
row_nnz(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); @@ -380,7 +377,7 @@ void sparse_dot_topn_extd_parallel( lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); @@ -407,110 +404,12 @@ void sparse_dot_topn_extd_parallel( Cj, Cx, &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - -} - -void inner_sparse_dot_free( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_sizes, - int* total, - int* n_minmax -) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(job_range.end - job_range.begin); - - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = 
sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; - (*total) += len; - } - real_candidates->shrink_to_fit(); } void sparse_dot_free_parallel( @@ -534,8 +433,8 @@ void sparse_dot_free_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> row_nnz(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); @@ -546,13 +445,13 @@ void sparse_dot_free_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread ( - inner_sparse_dot_free, + inner_sparse_dot_topn_extd, job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); @@ -585,13 +484,12 @@ void sparse_dot_free_parallel( &((*vCj)[0]), &((*vCx)[0]), &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - } void inner_sparse_only_max_nnz_col( diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 2f858444..84999abc 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ 
b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -32,85 +32,85 @@ np.import_array() cdef extern from "sparse_dot_topn_parallel.h": cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); cdef void sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* n_minmax, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); cdef void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - 
np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -122,27 +122,28 @@ cpdef sparse_dot_topn_threaded( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_jobs) + sparse_dot_topn_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_jobs + ) return cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = 
&a_indices[0] @@ -155,24 +156,25 @@ cpdef sparse_dot_topn_extd_threaded( cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) + sparse_dot_topn_extd_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax, n_jobs + ) return cpdef sparse_dot_free_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -187,7 +189,9 @@ cpdef sparse_dot_free_threaded( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + sparse_dot_free_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs + ) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) @@ -195,15 +199,15 @@ cpdef sparse_dot_free_threaded( return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs - ): + int n_row, + int n_col, 
+ np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] From 2cf60a0106fb6496e3d540066e119502156f9aae Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 11:42:24 +0200 Subject: [PATCH 11/29] made README.md "pypi.org-friendly" --- README.md | 142 +++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 7faa5239..6d391ead 100644 --- a/README.md +++ b/README.md @@ -7,29 +7,29 @@
-:information_source: Click to see image +Click to see image
-
+
-The image displayed above is a visualization of the graph-structure of one of the groups of strings found by string_grouper. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here 0.8). +The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`). -The ***centroid*** of the group, as determined by string_grouper (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. +The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. -The power of string_grouper is discernible from this image: in large datasets, string_grouper is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score. 
+The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score.
———
-This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by string_grouper operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. +This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by `string_grouper` operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. ---
-**string_grouper** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **string_grouper** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). +**`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). ## Installing -pip install string-grouper +`pip install string-grouper` ## Usage @@ -40,91 +40,91 @@ from string_grouper import match_strings, match_most_similar, \ StringGrouper ``` -As shown above, the library may be used together with pandas, and contains four high level functions (match_strings, match_most_similar, group_similar_strings, and compute_pairwise_similarities) that can be used directly, and one class (StringGrouper) that allows for a more interactive approach. +As shown above, the library may be used together with `pandas`, and contains four high level functions (`match_strings`, `match_most_similar`, `group_similar_strings`, and `compute_pairwise_similarities`) that can be used directly, and one class (`StringGrouper`) that allows for a more interactive approach. 
The permitted calling patterns of the four functions, and their return types, are: -| Function | Parameters | pandas Return Type | +| Function | Parameters | `pandas` Return Type | | -------------: |:-------------|:-----:| -| match_strings| (master, **kwargs)| DataFrame | -| match_strings| (master, duplicates, **kwargs)| DataFrame | -| match_strings| (master, master_id=id_series, **kwargs)| DataFrame | -| match_strings| (master, duplicates, master_id, duplicates_id, **kwargs)| DataFrame | -| match_most_similar| (master, duplicates, **kwargs)| Series (if kwarg `ignore_index=True`) otherwise DataFrame (default)| -| match_most_similar| (master, duplicates, master_id, duplicates_id, **kwargs)| DataFrame | -| group_similar_strings| (strings_to_group, **kwargs)| Series (if kwarg `ignore_index=True`) otherwise DataFrame (default)| -| group_similar_strings| (strings_to_group, strings_id, **kwargs)| DataFrame | -| compute_pairwise_similarities| (string_series_1, string_series_2, **kwargs)| Series | - -In the rest of this document the names, Series and DataFrame, refer to the familiar pandas object types. 
+| `match_strings`| `(master, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, duplicates, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, master_id=id_series, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | +| `match_most_similar`| `(master, duplicates, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| +| `match_most_similar`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | +| `group_similar_strings`| `(strings_to_group, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| +| `group_similar_strings`| `(strings_to_group, strings_id, **kwargs)`| `DataFrame` | +| `compute_pairwise_similarities`| `(string_series_1, string_series_2, **kwargs)`| `Series` | + +In the rest of this document the names, `Series` and `DataFrame`, refer to the familiar `pandas` object types. #### Parameters: |Name | Description | |:--- | :--- | -|**master** | A Series of strings to be matched with themselves (or with those in duplicates). | -|**duplicates** | A Series of strings to be matched with those of master. | -|**master_id** (or id_series) | A Series of IDs corresponding to the strings in master. | -|**duplicates_id** | A Series of IDs corresponding to the strings in duplicates. | -|**strings_to_group** | A Series of strings to be grouped. | -|**strings_id** | A Series of IDs corresponding to the strings in strings_to_group. | -|**string_series_1(_2)** | A Series of strings each of which is to be compared with its corresponding string in string_series_2(_1). | -|****kwargs** | Keyword arguments (see [below](#kwargs)).| +|**`master`** | A `Series` of strings to be matched with themselves (or with those in `duplicates`). | +|**`duplicates`** | A `Series` of strings to be matched with those of `master`. | +|**`master_id`** (or `id_series`) | A `Series` of IDs corresponding to the strings in `master`. 
| +|**`duplicates_id`** | A `Series` of IDs corresponding to the strings in `duplicates`. | +|**`strings_to_group`** | A `Series` of strings to be grouped. | +|**`strings_id`** | A `Series` of IDs corresponding to the strings in `strings_to_group`. | +|**`string_series_1(_2)`** | A `Series` of strings each of which is to be compared with its corresponding string in `string_series_2(_1)`. | +|**`**kwargs`** | Keyword arguments (see [below](#kwargs)).| #### Functions: * #### `match_strings` - Returns a DataFrame containing similarity-scores of all matching pairs of highly similar strings from master (and duplicates if given). Each matching pair in the output appears in its own row/record consisting of + Returns a `DataFrame` containing similarity-scores of all matching pairs of highly similar strings from `master` (and `duplicates` if given). Each matching pair in the output appears in its own row/record consisting of - 1. its "left" part: a string (with/without its index-label) from master, + 1. its "left" part: a string (with/without its index-label) from `master`, 2. its similarity score, and - 3. its "right" part: a string (with/without its index-label) from duplicates (or master if duplicates is not given), + 3. its "right" part: a string (with/without its index-label) from `duplicates` (or `master` if `duplicates` is not given), in that order. Thus the column-names of the output are a collection of three groups: - 1. The name of master and the name(s) of its index (or index-levels) all prefixed by the string `'left_'`, + 1. The name of `master` and the name(s) of its index (or index-levels) all prefixed by the string `'left_'`, 2. `'similarity'` whose column has the similarity-scores as values, and - 3. The name of duplicates (or master if duplicates is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. + 3. 
The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. - Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) + Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) - If either master or duplicates has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its pandas default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. + If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. - In other words, if only parameter master is given, the function will return pairs of highly similar strings within master. This can be seen as a self-join where both 'left_' and 'right_' prefixed columns come from master. If both parameters master and duplicates are given, it will return pairs of highly similar strings between master and duplicates. This can be seen as an inner-join where 'left_' and 'right_' prefixed columns come from master and duplicates respectively. + In other words, if only parameter `master` is given, the function will return pairs of highly similar strings within `master`. This can be seen as a self-join where both `'left_'` and `'right_'` prefixed columns come from `master`. 
If both parameters `master` and `duplicates` are given, it will return pairs of highly similar strings between `master` and `duplicates`. This can be seen as an inner-join where `'left_'` and `'right_'` prefixed columns come from `master` and `duplicates` respectively. - The function also supports optionally inputting IDs (master_id and duplicates_id) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional Series prefixed by 'left_' and 'right_' accordingly, and containing the IDs corresponding to the strings in the output. If any of these Series has no name, then it assumes the name `'id'` and is then prefixed as described above. + The function also supports optionally inputting IDs (`master_id` and `duplicates_id`) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional `Series` prefixed by `'left_'` and `'right_'` accordingly, and containing the IDs corresponding to the strings in the output. If any of these `Series` has no name, then it assumes the name `'id'` and is then prefixed as described above. * #### `match_most_similar` - If `ignore_index=True`, returns a Series of strings, where for each string in duplicates the most similar string in master is returned. If there are no similar strings in master for a given string in duplicates (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in duplicates is returned. The output Series thus has the same length and index as duplicates. + If `ignore_index=True`, returns a `Series` of strings, where for each string in `duplicates` the most similar string in `master` is returned. 
If there are no similar strings in `master` for a given string in `duplicates` (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in `duplicates` is returned. The output `Series` thus has the same length and index as `duplicates`. - For example, if an input Series with the values \['foooo', 'bar', 'baz'\] is passed as the argument master, and \['foooob', 'bar', 'new'\] as the values of the argument duplicates, the function will return a Series with values: \['foooo', 'bar', 'new'\]. + For example, if an input `Series` with the values `\['foooo', 'bar', 'baz'\]` is passed as the argument `master`, and `\['foooob', 'bar', 'new'\]` as the values of the argument `duplicates`, the function will return a `Series` with values: `\['foooo', 'bar', 'new'\]`. - The name of the output Series is the same as that of master prefixed with the string `'most_similar_'`. If master has no name, it is assumed to have the name `'master'` before being prefixed. + The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed. - If `ignore_index=False` (the default), `match_most_similar` returns a DataFrame containing the same Series described above as one of its columns. So it inherits the same index and length as duplicates. The rest of its columns correspond to the index (or index-levels) of master and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in master for a given string in duplicates then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in duplicates. 
Note that such replacements can only occur if the indexes of master and duplicates have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) + If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) - Each column-name of the output DataFrame has the same name as its corresponding column, index, or index-level of master prefixed with the string `'most_similar_'`. + Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`. - If both parameters master_id and duplicates_id are also given, then a DataFrame is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input Series corresponding to the output strings. This column's name is the same as that of master_id prefixed in the same way as described above. If master_id has no name, it is assumed to have the name `'master_id'` before being prefixed. 
+ If both parameters `master_id` and `duplicates_id` are also given, then a `DataFrame` is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input `Series` corresponding to the output strings. This column's name is the same as that of `master_id` prefixed in the same way as described above. If `master_id` has no name, it is assumed to have the name `'master_id'` before being prefixed. * #### `group_similar_strings` - Takes a single Series of strings (strings_to_group) and groups them by assigning to each string one string from strings_to_group chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) + Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) - If `ignore_index=True`, the output is a Series (with the same name as strings_to_group prefixed by the string `'group_rep_'`) of the same length and index as strings_to_group containing the group-representative strings. If strings_to_group has no name then the name of the returned Series is `'group_rep'`. + If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`. - For example, an input Series with values: \['foooo', 'foooob', 'bar'\] will return \['foooo', 'foooo', 'bar'\]. 
Here 'foooo' and 'foooob' are grouped together into group 'foooo' because they are found to be similar. Another example can be found [below](#dedup). + For example, an input Series with values: `\['foooo', 'foooob', 'bar'\]` will return `\['foooo', 'foooo', 'bar'\]`. Here `'foooo'` and `'foooob'` are grouped together into group `'foooo'` because they are found to be similar. Another example can be found [below](#dedup). - If `ignore_index=False`, the output is a DataFrame containing the above output Series as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of strings_to_group and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`. + If `ignore_index=False`, the output is a `DataFrame` containing the above output `Series` as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of `strings_to_group` and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`. - If strings_id is also given, then the IDs from strings_id corresponding to the group-representatives are also returned in an additional column (with the same name as strings_id prefixed as described above). If strings_id has no name, it is assumed to have the name `'id'` before being prefixed. + If `strings_id` is also given, then the IDs from `strings_id` corresponding to the group-representatives are also returned in an additional column (with the same name as `strings_id` prefixed as described above). If `strings_id` has no name, it is assumed to have the name `'id'` before being prefixed. * #### `compute_pairwise_similarities` - Returns a Series of cosine similarity scores the same length and index as string_series_1. 
Each score is the cosine similarity between the pair of strings in the same position (row) in the two input Series, string_series_1 and string_series_2, as the position of the score in the output Series. This can be seen as an element-wise comparison between the two input Series. + Returns a `Series` of cosine similarity scores the same length and index as `string_series_1`. Each score is the cosine similarity between the pair of strings in the same position (row) in the two input `Series`, `string_series_1` and `string_series_2`, as the position of the score in the output `Series`. This can be seen as an element-wise comparison between the two input `Series`. -All functions are built using a class **StringGrouper**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **StringGrouper** class directly. +All functions are built using a class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly. #### Options: @@ -133,17 +133,17 @@ All functions are built using a class **StringGrouper**. This class All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used: - * **ngram_size**: The amount of characters in each n-gram. Default is 3. - * **regex**: The regex string used to clean-up the input string. Default is "[,-./]|\s". - * **max_n_matches**: The maximum number of matches allowed per string in master. Default is 20. - * **min_similarity**: The minimum cosine similarity for two strings to be considered a match. 
- Defaults to 0.8 - * **number_of_processes**: The number of processes used by the cosine similarity calculation. Defaults to + * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. + * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. + * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). + * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. + Defaults to `0.8` + * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to `number of cores on a machine - 1.` - * **ignore_index**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **replace_na**: For function match_most_similar, determines whether `NaN` values in index-columns are replaced or not by index-labels from duplicates. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md).) **Note:** If include_zeroes is `True` and the kwarg max_n_matches is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and string_grouper suggests an alternative value for max_n_matches. To allow string_grouper to automatically use the appropriate value for max_n_matches then do not set this kwarg at all. - * **group_rep**: For function group_similar_strings, determines how group-representatives are chosen. 
Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. + * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md).) **Note:** If `include_zeroes` is `True` and the kwarg `max_n_matches` is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and `string_grouper` suggests an alternative value for `max_n_matches`. To allow `string_grouper` to automatically use the appropriate value for `max_n_matches` then do not set this kwarg at all. + * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation. ## Examples @@ -231,7 +231,7 @@ matches[matches['left_Company Name'] != matches['right_Company Name']].head() ### Find all matches in between two data sets. 
-The match_strings function finds similar items between two data sets as well. This can be seen as an inner join between two data sets: +The `match_strings` function finds similar items between two data sets as well. This can be seen as an inner join between two data sets: ```python @@ -301,11 +301,11 @@ matches -Out of the four company names in duplicates, three companies are found in the original company data set. One company is found three times. +Out of the four company names in `duplicates`, three companies are found in the original company data set. One company is found three times. ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied. -A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the match_strings function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available. +A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available. ### For a second data set, find only the most similar match @@ -362,7 +362,7 @@ pd.concat([new_companies, matches], axis=1) ### Deduplicate a single data set and show items with most duplicates -The group_similar_strings function groups strings that are similar using a single linkage clustering algorithm. 
That is, if item A and item B are similar; and item B and item C are similar; but the similarity between A and C is below the threshold; then all three items are grouped together. +The `group_similar_strings` function groups strings that are similar using a single linkage clustering algorithm. That is, if item A and item B are similar; and item B and item C are similar; but the similarity between A and C is below the threshold; then all three items are grouped together. ```python # Add the grouped strings: @@ -389,7 +389,7 @@ companies.groupby('deduplicated_name')['Line Number'].count().sort_values(ascend Name: Line Number, dtype: int64 -The group_similar_strings function also works with IDs: imagine a DataFrame (customers_df) with the following content: +The `group_similar_strings` function also works with IDs: imagine a `DataFrame` (`customers_df`) with the following content: ```python # Create a small set of artificial customer names: customers_df = pd.DataFrame( @@ -443,7 +443,7 @@ customers_df -The output of group_similar_strings can be directly used as a mapping table: +The output of `group_similar_strings` can be directly used as a mapping table: ```python # Group customers with similar names: customers_df[["group-id", "name_deduped"]] = \ @@ -503,11 +503,11 @@ customers_df -Note that here customers_df initially had only one column "Customer Name" (before the group_similar_strings function call); and it acquired two more columns "group-id" (the index-column) and "name_deduped" after the call through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a pandas feature). 
+Note that here `customers_df` initially had only one column "Customer Name" (before the `group_similar_strings` function call); and it acquired two more columns "group-id" (the index-column) and "name_deduped" after the call through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a `pandas` feature). ### Simply compute the cosine similarities of pairs of strings -Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function compute_pairwise_similarities: +Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function `compute_pairwise_similarities`: ```python # Create a small DataFrame of pairs of strings: @@ -640,14 +640,14 @@ pair_s ## The StringGrouper class -The four functions mentioned above all create a StringGrouper object behind the scenes and call different functions on it. The StringGrouper class keeps track of all tuples of similar strings and creates the groups out of these. Since matches are often not perfect, a common workflow is to: +The four functions mentioned above all create a `StringGrouper` object behind the scenes and call different functions on it. The `StringGrouper` class keeps track of all tuples of similar strings and creates the groups out of these. Since matches are often not perfect, a common workflow is to: 1. Create matches 2. Manually inspect the results 3. Add and remove matches where necessary 4. Create groups of similar strings -The StringGrouper class allows for this without having to re-calculate the cosine similarity matrix. See below for an example. +The `StringGrouper` class allows for this without having to re-calculate the cosine similarity matrix. See below for an example. 
```python From c96ec50fe41e5b469fc30177c138389ef5bacab7 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 12:12:19 +0200 Subject: [PATCH 12/29] rearranged code in string_grouper.py --- string_grouper/string_grouper.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d3eb07c6..d4f38387 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -468,22 +468,6 @@ def _get_non_matches_list(self) -> pd.DataFrame: missing_pairs['similarity'] = 0 return missing_pairs - @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: - A = AA.tolil() - r, c = A.nonzero() - A[c, r] = A[r, c] - return A.tocsr() - - @staticmethod - def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: - """Returns a list of all the indices of matches""" - r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), - 'similarity': matches.data}) - return matches_list - def _get_nearest_matches(self, ignore_index=False, replace_na=False) -> Union[pd.DataFrame, pd.Series]: @@ -634,6 +618,21 @@ def _validate_replace_na_and_drop(self): "index if the number of index-levels does not equal the number of index-columns." 
) + @staticmethod + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + + @staticmethod + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: + """Returns a list of all the indices of matches""" + r, c = matches.nonzero() + return pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) + @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: columns_switched = pd.DataFrame({'master_side': new_matches.dupe_side, From 0f0b2c3207a0517f661d49d6ed874ec8dd26407a Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 29 Apr 2021 07:08:29 +0200 Subject: [PATCH 13/29] corrected optional_kwargs for awesome_cossim_dotn in _build_matches() so that return_best_topn is always True --- string_grouper/string_grouper.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d4f38387..4ea5e380 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -436,13 +436,11 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - optional_kwargs = dict() - if self._config.number_of_processes > 1: - optional_kwargs = { - 'return_best_topn': True, - 'use_threads': True, - 'n_jobs': self._config.number_of_processes - } + optional_kwargs = { + 'return_best_topn': True, + 'use_threads': self._config.number_of_processes > 1, + 'n_jobs': self._config.number_of_processes + } return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, From 57c4122d3fa3af8882312fde71c939086dfbd5b8 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 29 Apr 2021 21:51:23 +0200 Subject: [PATCH 14/29] added 
scouting function that determines the amount of memory needed for the matrix-product results. Also discarded the entire C/C++ memory management extension array_wrappers. --- setup.py | 11 +- sparse_dot_topn/array_wrappers.pxd | 18 - sparse_dot_topn/array_wrappers.pyx | 73 --- sparse_dot_topn/awesome_cossim_topn.py | 182 ++----- sparse_dot_topn/example/comparison2.py | 296 ++++++----- sparse_dot_topn/sparse_dot_topn.pyx | 491 ++++++++---------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 164 ++---- sparse_dot_topn/sparse_dot_topn_parallel.h | 39 +- sparse_dot_topn/sparse_dot_topn_source.cpp | 158 ++---- sparse_dot_topn/sparse_dot_topn_source.h | 18 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 56 +- .../test/test_awesome_cossim_topn.py | 46 +- string_grouper/string_grouper.py | 2 +- 13 files changed, 557 insertions(+), 997 deletions(-) delete mode 100644 sparse_dot_topn/array_wrappers.pxd delete mode 100644 sparse_dot_topn/array_wrappers.pyx diff --git a/setup.py b/setup.py index 577ed0d9..c47aa78b 100644 --- a/setup.py +++ b/setup.py @@ -29,15 +29,6 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] -array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', - sources=[ - './sparse_dot_topn/array_wrappers.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - original_ext = Extension('sparse_dot_topn.sparse_dot_topn', sources=[ './sparse_dot_topn/sparse_dot_topn.pyx', @@ -91,5 +82,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[array_wrappers_ext, original_ext, threaded_ext] + ext_modules=[original_ext, threaded_ext] ) diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd deleted file mode 100644 index d77e41b3..00000000 --- a/sparse_dot_topn/array_wrappers.pxd +++ /dev/null @@ -1,18 +0,0 @@ 
-from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx deleted file mode 100644 index ee458629..00000000 --- a/sparse_dot_topn/array_wrappers.pyx +++ /dev/null @@ -1,73 +0,0 @@ -from cpython cimport Py_buffer -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 48caaa57..baa14fbc 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -12,21 +12,13 @@ def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - mem_manager_is_C=False, - return_best_topn=False - ): + A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, scout_nnz=False, return_best_ntop=False): """ This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True then best_topn + If return_best_ntop=True then best_ntop (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_topn). + will also be returned in a tuple together with C as (C, best_ntop). 
Input: A and B: two CSR matrices @@ -34,23 +26,22 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - mem_manager_is_C: (default: False) this is mainly for testing purposes. if - True, will force memory management to be handed over to - C/C++. - return_best_topn: (default: False) if True, will return best_topn together - with C as a tuple: (C, best_topn) + scout_nnz: (default: False) this is mainly for testing purposes. if + True, will force a memory-size determination before computing + the results. + return_best_ntop: (default: False) if True, will return best_ntop together + with C as a tuple: (C, best_ntop) Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. + C: result matrix (returned alone, if return_best_ntop=False) + best_ntop: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_ntop). It is + returned only if return_best_ntop=True. N.B. 
if A and B are not in CSR format, they will be converted to CSR """ if not isspmatrix_csr(A): A = A.tocsr() - if not isspmatrix_csr(B): B = B.tocsr() @@ -71,7 +62,7 @@ def awesome_cossim_topn( indices = np.zeros(nnz_max, dtype=idx_dtype) data = np.zeros(nnz_max, dtype=A.dtype) output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: + if return_best_ntop: return output, 0 else: return output @@ -81,146 +72,77 @@ def awesome_cossim_topn( try: indices = np.empty(nnz_max, dtype=idx_dtype) data = np.empty(nnz_max, dtype=A.dtype) - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + if scout_nnz: raise MemoryError # This is mainly for testing purposes except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + # if scout_nnz: print('Exception raised! Continuing ...', flush=True) # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. - # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) + # sparse_dot_topn will go ahead and count the exact amount of memory required. 
if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), + + nnz = ct.sparse_dot_only_nnz(M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), B.data, - ntop, lower_bound, - indptr + ntop, lower_bound ) else: - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + nnz = ct_thread.sparse_dot_only_nnz_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), B.data, - ntop, lower_bound, - indptr, n_jobs + ntop, lower_bound, n_jobs ) - - else: - # no exception was raised; then use old function (as it is expected to be the fastest) - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: + + nnz = max(1, nnz) + indices = np.empty(nnz, dtype=idx_dtype) + data = np.empty(nnz, dtype=A.dtype) - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) + # no exception was raised; then use old function (as it is expected to be the fastest) - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) + best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - - best_topn = best_topn_arr[0] - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output - - -def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. 
if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), + A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) + B.data, + ntop, + lower_bound, + indptr, indices, data, best_ntop_arr + ) else: if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' raise ValueError(err_str) - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), + A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) + B.data, + ntop, + lower_bound, + indptr, indices, data, best_ntop_arr, n_jobs + ) + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_ntop: + return output, best_ntop_arr[0] + else: + return output - return minmax_topn[0] diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py index 7af5d08a..c54a2ff8 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/sparse_dot_topn/example/comparison2.py @@ -5,165 +5,177 @@ from __future__ import print_function import timeit import numpy as np +import pandas as pd from scipy.sparse import coo_matrix from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) + N = 1000 thresh = 0.01 -nr_vocab = 2 << 24 -density = 1e-6 +nr_vocab = int(26**3) +density = 30/nr_vocab 
n_samples = 1000000 n_duplicates = N nnz_a = int(n_samples * nr_vocab * density) nnz_b = int(n_duplicates * nr_vocab * density) +print(f'ntop = {N}', flush=True) +print(f'threshold = {thresh}', flush=True) print(f'density = {density}', flush=True) print(f'nr_vocab = {nr_vocab}', flush=True) print(f'n_samples = {n_samples}', flush=True) print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_a = {nnz_a}', flush=True) -print(f'nnz_b = {nnz_b}', flush=True) +print(f'nnz_A = {nnz_a}', flush=True) +print(f'nnz_B = {nnz_b}', flush=True) print('', flush=True) rng1 = np.random.RandomState(42) rng2 = np.random.RandomState(43) -row = rng1.randint(n_samples, size=nnz_a) -cols = rng2.randint(nr_vocab, size=nnz_a) -data = rng1.rand(nnz_a) - -a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) -a = a_sparse.tocsr() - -row = rng1.randint(n_duplicates, size=nnz_b) -cols = rng2.randint(nr_vocab, size=nnz_b) -data = rng1.rand(nnz_b) - -b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) -b = b_sparse.T.tocsr() - - -# top 5 results per row - -print("Non-parallelized sparse_dot_topn function") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 1 thread") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 2 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) -rtv2 = 
timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 3 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 4 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 5 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 6 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 7 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) 
-print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - - -# use scipy and numpy function - - -def get_csr_ntop_idx_data(csr_row, ntop): - """ - Get list (row index, score) of the n top matches - """ - nnz = csr_row.getnnz() - if nnz == 0: - return None - elif nnz <= ntop: - result = zip(csr_row.indices, csr_row.data) - else: - arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] - result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) - - return sorted(result, key=lambda x: -x[1]) - - -def scipy_cossim_top(A, B, ntop, lower_bound=0): - C = A.dot(B) - return [get_csr_ntop_idx_data(row, ntop) for row in C] - -# top 5 results per row which element is greater than 2 - - -print("Scipy+numpy original function") - -rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) +n_matrix_pairs = 2**4 +nnz_arr = np.full(n_matrix_pairs, 0) +ntop_arr = np.full(n_matrix_pairs, 0) +r = 0 +for it in range(n_matrix_pairs): + + row = rng1.randint(n_samples, size=nnz_a) + cols = rng2.randint(nr_vocab, size=nnz_a) + data = rng1.rand(nnz_a) + + a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a = a_sparse.tocsr() + + row = rng1.randint(n_duplicates, size=nnz_b) + cols = rng2.randint(nr_vocab, size=nnz_b) + data = rng1.rand(nnz_b) + + b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) + b = b_sparse.T.tocsr() + + C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True) + print(f'nnz(A*B) = {len(C.data)}', flush=True) + print(f'ntop(A*B) = {C_ntop}', flush=True) + print('', flush=True) + nnz_arr[it] = len(C.data) + ntop_arr[it] = C_ntop + + + # top 5 results per row + + print("Non-parallelized sparse_dot_topn function") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 0, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] 
+ r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 1 thread") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 1, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 2 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 2, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 3 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 3, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 4 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 4, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 5 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, 
thresh, True, 5)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 5, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 6 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 6, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 7 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 7, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print('') + print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') + print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') + print('') + df = df.astype({ + 'sample': np.int64, '#threads': np.int64, 'python': np.float64, '+scout': np.float64, '%inc': np.float64}) + results = df.groupby('#threads', as_index=True, sort=True)[['python', '+scout', '%inc']].mean() + + print(results) + print('') + print('') diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 580c0f2f..9728c467 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the 
"License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,6 @@ # distutils: language = c++ from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -30,283 +29,225 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* 
nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef int sparse_dot_only_nnz_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx - ) - return + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx + ) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, ): - """ - Cython glue function to call sparse_dot_topn_extd C++ - implementation. This function will return a matrix C in CSR - format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. 
- - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax - ) - return - -cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr + """ + Cython glue function to call sparse_dot_topn_extd C++ + implementation. This function will return a matrix C in CSR + format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. 
+ + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types + of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + ) + return + + +cpdef sparse_dot_only_nnz( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. 
- - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr: index-pointer of the CSR expression of matrix C - - Returned Output: - c_indices, c_data: indices and data of the CSR expression - of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax - ) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] - - -cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn -): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: 
CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + """ + Cython glue function to call sparse_dot_nnz_only C++ implementation + This function will return nnz, the total number of nonzero + matrix-components of + C = [top n results > lower_bound for each row of A * B]. + + Input: + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Returned output: + nnz: the total number of nonzero matrix-components of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + + return sparse_dot_only_nnz_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound + ) diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 1b06e927..8d8fadc6 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -170,7 +170,6 @@ void inner_sparse_dot_topn( *(row_nnz_ptr++) = len; (*total) += len; } - real_candidates->shrink_to_fit(); } void sparse_dot_topn_parallel( @@ -335,7 +334,6 @@ void inner_sparse_dot_topn_extd( *(row_nnz_ptr++) = len; (*total) += len; } - real_candidates->shrink_to_fit(); } void sparse_dot_topn_extd_parallel( @@ -412,147 +410,98 @@ void sparse_dot_topn_extd_parallel( thread_list[job_nr].join(); } -void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - std::vector* vCj, - std::vector* vCx, - int* n_minmax, - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregates: - std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); - - // execute the jobs: - std::vector thread_list(n_jobs); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread ( - inner_sparse_dot_topn_extd, - job_ranges[job_nr], - n_col, - ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr], - &split_n_minmax[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results (in parallel): - *n_minmax = 
*std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - - int total = start_points.back(); - vCj->resize(total); - vCj->shrink_to_fit(); - vCx->resize(total); - vCx->shrink_to_fit(); - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_function, - job_ranges[job_nr], - Cp, - start_points[job_nr], - &((*vCj)[0]), - &((*vCx)[0]), - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); -} - -void inner_sparse_only_max_nnz_col( +void inner_sparse_nnz_only( job_range_type job_range, int n_col_inner, + int ntop_inner, + double lower_bound_inner, int Ap_copy[], int Aj_copy[], + double Ax_copy[], int Bp_copy[], int Bj_copy[], - int *max_nnz_col // already initialized to 0 + double Bx_copy[], + int* nnz ) { - std::vector unmarked(n_col_inner, true); + + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); for(int i = job_range.begin; i < job_range.end; i++){ + int head = -2; int length = 0; + int candidates_sz = 0; int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; + int jj_end = Ap_copy[i + 1]; for(int jj = jj_start; jj < jj_end; jj++){ int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; + int kk_end = Bp_copy[j + 1]; for(int kk = kk_start; kk < kk_end; kk++){ int k = Bj_copy[kk]; //kth column of B in row j - if(unmarked[k]){ // if this k is not already marked then ... 
- unmarked[k] = false; // keep a record of column k + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; length++; } } } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner) candidates_sz++; + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + if (candidates_sz > ntop_inner) candidates_sz = ntop_inner; + + (*nnz) += candidates_sz; } } -void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col, - int n_jobs +int sparse_dot_only_nnz_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int n_jobs ) { - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); + std::vector job_row_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_row_ranges); - std::vector split_max_nnz_col(n_jobs, 0); + std::vector split_nnz(n_jobs, 0); std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread ( - inner_sparse_only_max_nnz_col, - job_ranges[job_nr], + inner_sparse_nnz_only, + job_row_ranges[job_nr], n_col, - Ap, Aj, Bp, Bj, - &split_max_nnz_col[job_nr] + ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + &split_nnz[job_nr] ); } @@ -560,5 +509,6 @@ void sparse_dot_only_max_nnz_col_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - *max_nnz_col = *max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); + return std::accumulate(split_nnz.begin(), split_nnz.end(), (int) 0); } + diff 
--git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 716ca04e..0099917e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -58,33 +58,18 @@ extern void sparse_dot_topn_extd_parallel( int n_jobs ); -extern void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax, - int njobs -); - -extern void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col, - int n_jobs +extern int sparse_dot_only_nnz_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int n_jobs ); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 17b8b121..0cc14e62 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -247,12 +247,14 @@ void sparse_dot_topn_extd_source( } /* - C++ implementation of sparse_dot_free_source + C++ implementation of sparse_dot_nnz_source - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. + This function is designed primarily to help with memory management for + very large sparse matrices. 
Input: n_row: number of rows of A matrix @@ -261,18 +263,15 @@ void sparse_dot_topn_extd_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp: C array for idx_pointer of CSR expression of C matrix - Cj: STL vector for indices of CSR expression of C matrix - Cx: STL vector for data values of CSR expression of C matrix - n_minmax: the maximum number of elements per row of C + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C N.B. A and B must be CSR format!!! */ -void sparse_dot_free_source( +void sparse_dot_nnz_source( int n_row, int n_col, int Ap[], @@ -281,25 +280,16 @@ void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B - int ntop, double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax + int* nnz, + int* ntop ) { - *n_minmax = 0; - int sz = std::max(n_row, n_col); - Cj->reserve(sz); - Cx->reserve(sz); - std::vector next(n_col,-1); std::vector sums(n_col, 0); - std::vector candidates; - - Cp[0] = 0; + *nnz = 0; + *ntop = 0; for(int i = 0; i < n_row; i++){ int head = -2; @@ -326,14 +316,10 @@ void sparse_dot_free_source( } } + int nnz_k = 0; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in int temp = head; head = next[head]; //iterate over columns @@ -341,36 +327,17 @@ void sparse_dot_free_source( next[temp] = -1; //clear arrays sums[temp] = 0; //clear arrays } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj->push_back(candidates[a].index); - Cx->push_back(candidates[a].value); - } - candidates.clear(); - - Cp[i+1] = Cj->size(); + *ntop = (nnz_k > *ntop)? nnz_k : *ntop; + *nnz += nnz_k; } } /* - C++ implementation of sparse_dot_nnz_source + C++ implementation of sparse_dot_only_max_nnz_col_source - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. + This function will return nnz, the total number of nonzero + matrix-components of + C = [top n results > lower_bound for each row of A * B]. Input: n_row: number of rows of A matrix @@ -379,15 +346,15 @@ void sparse_dot_free_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix + ntop: top n results lower_bound: a threshold that the element of A*B must greater than - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C + Returned output: + nnz: the total number of nonzero matrix-components of C N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_nnz_source( +int sparse_dot_only_nnz_source( int n_row, int n_col, int Ap[], @@ -396,20 +363,19 @@ void sparse_dot_nnz_source( int Bp[], int Bj[], double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop + int ntop, + double lower_bound ) { std::vector next(n_col,-1); std::vector sums(n_col, 0); - *nnz = 0; - *ntop = 0; + int nnz = 0; for(int i = 0; i < n_row; i++){ int head = -2; int length = 0; + int candidates_sz = 0; int jj_start = Ap[i]; int jj_end = Ap[i+1]; @@ -432,10 +398,9 @@ void sparse_dot_nnz_source( } } - int nnz_k = 0; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + if(sums[head] > lower_bound) candidates_sz++; int temp = head; head = next[head]; //iterate over columns @@ -443,63 +408,10 @@ void sparse_dot_nnz_source( next[temp] = -1; //clear arrays sums[temp] = 0; //clear arrays } - *ntop = (nnz_k > *ntop)? nnz_k : *ntop; - *nnz += nnz_k; - } -} -/* - C++ implementation of sparse_dot_only_max_nnz_col_source + if (candidates_sz > ntop) candidates_sz = ntop; - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - Output by reference: - max_nnz_col: the maximum number of columns set per row - over all rows of A * B - - N.B. A and B must be CSR format!!! 
-*/ -void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col -) -{ - std::vector unmarked(n_col, true); - - *max_nnz_col = 0; - - for(int i = 0; i < n_row; i++){ - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; // kth column of B in row j - - if(unmarked[k]){ // if this k is not already marked then ... - unmarked[k] = false; // keep a record of column k - length++; - } - } - } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + nnz += candidates_sz; } + return nnz; } diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 9580d1cf..7975a75b 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -61,7 +61,7 @@ extern void sparse_dot_topn_extd_source( int* n_minmax ); -extern void sparse_dot_free_source( +extern int sparse_dot_only_nnz_source( int n_row, int n_col, int Ap[], @@ -71,21 +71,7 @@ extern void sparse_dot_free_source( int Bj[], double Bx[], //data of B int ntop, - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax -); - -extern void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col + double lower_bound ); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 84999abc..ad95fbb9 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -20,7 +20,6 @@ # distutils: language = c++ from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -66,7 +65,7 @@ 
cdef extern from "sparse_dot_topn_parallel.h": int n_jobs ); - cdef void sparse_dot_free_parallel( + cdef int sparse_dot_only_nnz_parallel( int n_row, int n_col, int Ap[], @@ -77,21 +76,6 @@ cdef extern from "sparse_dot_topn_parallel.h": double Bx[], int ntop, double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, int n_jobs ); @@ -161,7 +145,7 @@ cpdef sparse_dot_topn_extd_threaded( ) return -cpdef sparse_dot_free_threaded( +cpdef sparse_dot_only_nnz_threaded( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -172,7 +156,6 @@ cpdef sparse_dot_free_threaded( np.ndarray[double, ndim=1] b_data, int ntop, double lower_bound, - np.ndarray[int, ndim=1] c_indptr, int n_jobs ): @@ -182,38 +165,7 @@ cpdef sparse_dot_free_threaded( cdef int* Bp = &b_indptr[0] cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs + return sparse_dot_only_nnz_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, n_jobs ) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] - -cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = 
&b_indices[0] - cdef int* o_max_nnz_col = &max_nnz_col[0] - - sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) - return diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index ba7dfbfc..1a631179 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -11,7 +11,7 @@ PRUNE_THRESHOLD = 0.1 NUM_CANDIDATES = 3 -MEM_MANAGER_IS_C = True +SCOUT_NNZ = True USE_THREADS = True MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 @@ -38,7 +38,7 @@ def get_n_top_sparse(mat, n_top=10): def helper_awesome_cossim_topn_dense( a_dense, b_dense, - mem_manager_is_C=False, + scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -58,7 +58,7 @@ def helper_awesome_cossim_topn_dense( awesome_result = awesome_cossim_topn( a_csr, b_csr_t, len(b_dense), 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -67,7 +67,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -79,7 +79,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, len(b_dense), PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -88,7 +88,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -119,7 +119,7 @@ def helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, flag=True, - mem_manager_is_C=False, + scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -141,7 +141,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -150,7 +150,7 @@ def 
helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -162,7 +162,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -171,7 +171,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -216,14 +216,14 @@ def test_awesome_cossim_topn_manually(): [0.6, 0.1, 0.2, 0.8, 0.1], [0.9, 0.1, 0.6, 0.4, 0.3]] helper_awesome_cossim_topn_dense(a_dense, b_dense) - helper_awesome_cossim_topn_dense(a_dense, b_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_dense(a_dense, b_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_dense( a_dense, b_dense, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -240,14 +240,14 @@ def test_awesome_cossim_topn_manually(): [0, 0, 0, 0.1, 0.3], [0, 0, 0, 0.7, 0.5]] helper_awesome_cossim_topn_dense(c_dense, d_dense) - helper_awesome_cossim_topn_dense(c_dense, d_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_dense(c_dense, d_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_dense( c_dense, d_dense, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -264,14 +264,14 @@ def test_awesome_cossim_top_one_zeros(): a_sparse = csr_matrix(np.zeros((1, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, 
format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -288,14 +288,14 @@ def test_awesome_cossim_top_all_zeros(): a_sparse = csr_matrix(np.zeros((2, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -311,7 +311,7 @@ def test_awesome_cossim_top_small_matrix(): a_sparse = rand(300, nr_vocab, density=density, format='csr') b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) @@ -319,7 +319,7 @@ def test_awesome_cossim_top_small_matrix(): a_sparse, b_sparse, False, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, 
use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -360,7 +360,7 @@ def test_awesome_cossim_top_large_matrix(): b_sparse = b_sparse.tocsr() helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) @@ -368,7 +368,7 @@ def test_awesome_cossim_top_large_matrix(): a_sparse, b_sparse, False, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 4ea5e380..61903d5f 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -437,7 +437,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() optional_kwargs = { - 'return_best_topn': True, + 'return_best_ntop': True, 'use_threads': self._config.number_of_processes > 1, 'n_jobs': self._config.number_of_processes } From 6b7ee4b84d912bcdcf32afe5c47d37a8f3353419 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sun, 2 May 2021 23:16:12 +0200 Subject: [PATCH 15/29] introduced heuristic to reduce over-estimate of memory allocation for the matrix product --- setup.py | 10 ++- sparse_dot_topn/array_wrappers.pxd | 16 ++++ sparse_dot_topn/array_wrappers.pyx | 73 +++++++++++++++ sparse_dot_topn/awesome_cossim_topn.py | 71 ++++++--------- sparse_dot_topn/example/comparison2.py | 88 +++++++------------ sparse_dot_topn/example/comparison3.py | 61 +++++++++++++ sparse_dot_topn/sparse_dot_topn.pyx | 56 +++++++----- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 28 +++++- sparse_dot_topn/sparse_dot_topn_parallel.h | 5 
+- sparse_dot_topn/sparse_dot_topn_source.cpp | 40 +++++++-- sparse_dot_topn/sparse_dot_topn_source.h | 5 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 29 ++++-- .../test/test_awesome_cossim_topn.py | 61 ------------- 13 files changed, 342 insertions(+), 201 deletions(-) create mode 100644 sparse_dot_topn/array_wrappers.pxd create mode 100644 sparse_dot_topn/array_wrappers.pyx create mode 100644 sparse_dot_topn/example/comparison3.py diff --git a/setup.py b/setup.py index c47aa78b..cf5d5fee 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,14 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] +array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', + sources=[ + './sparse_dot_topn/array_wrappers.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], + extra_compile_args=extra_compile_args, + language='c++') + original_ext = Extension('sparse_dot_topn.sparse_dot_topn', sources=[ './sparse_dot_topn/sparse_dot_topn.pyx', @@ -82,5 +90,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[original_ext, threaded_ext] + ext_modules=[array_wrappers_ext, original_ext, threaded_ext], ) diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd new file mode 100644 index 00000000..3af1a3c4 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pxd @@ -0,0 +1,16 @@ +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] diff --git a/sparse_dot_topn/array_wrappers.pyx 
b/sparse_dot_topn/array_wrappers.pyx new file mode 100644 index 00000000..18525766 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pyx @@ -0,0 +1,73 @@ +from cpython cimport Py_buffer +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. + + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index baa14fbc..380c6e6e 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -2,6 +2,8 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr +from _ast import Continue +from numpy import indices if sys.version_info[0] >= 3: from sparse_dot_topn import sparse_dot_topn as ct @@ -12,7 +14,7 @@ def awesome_cossim_topn( - A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, scout_nnz=False, return_best_ntop=False): + A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, return_best_ntop=False): """ This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. 
@@ -26,9 +28,6 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - scout_nnz: (default: False) this is mainly for testing purposes. if - True, will force a memory-size determination before computing - the results. return_best_ntop: (default: False) if True, will return best_ntop together with C as a tuple: (C, best_ntop) @@ -40,6 +39,14 @@ def awesome_cossim_topn( N.B. if A and B are not in CSR format, they will be converted to CSR """ + def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: + try: + ind_arr = np.empty(sz, dtype=idx_dtype) + dat_arr = np.empty(sz, dtype=data_dtype) + return True + except MemoryError: + return False + if not isspmatrix_csr(A): A = A.tocsr() if not isspmatrix_csr(B): @@ -67,50 +74,24 @@ def awesome_cossim_topn( else: return output - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - if scout_nnz: raise MemoryError # This is mainly for testing purposes - except MemoryError: - # if scout_nnz: print('Exception raised! Continuing ...', flush=True) - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will go ahead and count the exact amount of memory required. 
- if not use_threads: - - nnz = ct.sparse_dot_only_nnz(M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, lower_bound - ) - - else: + indptr = np.empty(M + 1, dtype=idx_dtype) + + # reduce nnz_max if too large to fit in available memory: + while (not try_malloc(nnz_max, idx_dtype, A.dtype)): + nnz_max = nnz_max//2 - nnz = ct_thread.sparse_dot_only_nnz_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, lower_bound, n_jobs - ) - - nnz = max(1, nnz) - indices = np.empty(nnz, dtype=idx_dtype) - data = np.empty(nnz, dtype=A.dtype) - - # no exception was raised; then use old function (as it is expected to be the fastest) + # take a chance on high matrix-sparsity and reduce further: + nnz_max = max(M, nnz_max//16) + + # filled matrices from here on + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) best_ntop_arr = np.full(1, 0, dtype=idx_dtype) if not use_threads: - ct.sparse_dot_topn_extd( + alt_indices, alt_data = ct.sparse_dot_topn_extd( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, @@ -127,7 +108,7 @@ def awesome_cossim_topn( err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
raise ValueError(err_str) - ct_thread.sparse_dot_topn_extd_threaded( + alt_indices, alt_data = ct_thread.sparse_dot_topn_extd_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, @@ -139,6 +120,10 @@ def awesome_cossim_topn( indptr, indices, data, best_ntop_arr, n_jobs ) + if alt_indices is not None: + indices = alt_indices + data = alt_data + # prepare and return the output: output = csr_matrix((data, indices, indptr), shape=(M, N)) if return_best_ntop: diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py index c54a2ff8..c79cb45f 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/sparse_dot_topn/example/comparison2.py @@ -9,9 +9,9 @@ from scipy.sparse import coo_matrix from sparse_dot_topn import awesome_cossim_topn # noqa: F401 -df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) +df = pd.DataFrame(columns=['sample', '#threads', 'python']) -N = 1000 +N = 4000 thresh = 0.01 nr_vocab = int(26**3) @@ -32,7 +32,6 @@ print('', flush=True) rng1 = np.random.RandomState(42) -rng2 = np.random.RandomState(43) n_matrix_pairs = 2**4 nnz_arr = np.full(n_matrix_pairs, 0) @@ -41,14 +40,14 @@ for it in range(n_matrix_pairs): row = rng1.randint(n_samples, size=nnz_a) - cols = rng2.randint(nr_vocab, size=nnz_a) + cols = rng1.randint(nr_vocab, size=nnz_a) data = rng1.rand(nnz_a) a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) a = a_sparse.tocsr() row = rng1.randint(n_duplicates, size=nnz_b) - cols = rng2.randint(nr_vocab, size=nnz_b) + cols = rng1.randint(nr_vocab, size=nnz_b) data = rng1.rand(nnz_b) b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) @@ -60,7 +59,8 @@ print('', flush=True) nnz_arr[it] = len(C.data) ntop_arr[it] = C_ntop - + del C + del C_ntop # top 5 results per row @@ -69,112 +69,88 @@ rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', number=3, globals=globals()) - rtv2 = 
timeit.timeit('awesome_cossim_topn(a, b, N, thresh, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 0, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 0, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 1 thread") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 1, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 1, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 2 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 2, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 2, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 3 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 3, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 3, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', 
flush=True) print("Threaded function with 4 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 4, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 4, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 5 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 5, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 5, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 6 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 6, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 6, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 7 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 7, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 7, rtv] r += 1 - 
print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print('') print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') print('') df = df.astype({ - 'sample': np.int64, '#threads': np.int64, 'python': np.float64, '+scout': np.float64, '%inc': np.float64}) - results = df.groupby('#threads', as_index=True, sort=True)[['python', '+scout', '%inc']].mean() + 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) + results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() print(results) print('') diff --git a/sparse_dot_topn/example/comparison3.py b/sparse_dot_topn/example/comparison3.py new file mode 100644 index 00000000..b1b9412c --- /dev/null +++ b/sparse_dot_topn/example/comparison3.py @@ -0,0 +1,61 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import time +import numpy as np +import pandas as pd +from scipy.sparse import load_npz +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) + +a = load_npz('sparse_matrix_A.npz') +b = load_npz('sparse_matrix_B.npz') + +# tic = time.perf_counter() +# p = np.random.permutation(a.shape[0]) +# a = a[p] +# toc = time.perf_counter() +# print(f'shuffle(A) took {(toc - tic):0.4f} seconds', flush=True) + + +N = b.shape[1] +thresh = 0.8 + +nr_vocab = b.shape[0] +density_A = len(a.data)/(a.shape[0]*a.shape[1]) +density_B = len(b.data)/(b.shape[0]*b.shape[1]) +n_samples = a.shape[0] +n_duplicates = b.shape[1] +nnz_a = len(a.data) +nnz_b = len(b.data) + +print(f'ntop = {N}', flush=True) +print(f'threshold = {thresh}', flush=True) +print(f'density(A) = {density_A}', flush=True) 
+print(f'density(B) = {density_B}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_A = {nnz_a}', flush=True) +print(f'nnz_B = {nnz_b}', flush=True) +print('', flush=True) + +n_matrix_pairs = 1 +nnz_arr = np.full(n_matrix_pairs, 0) +ntop_arr = np.full(n_matrix_pairs, 0) +r = 0 +it = 0 + +tic = time.perf_counter() +C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs = 7, return_best_ntop=True) +toc = time.perf_counter() + +print('scout_nnz=True, use_threads=True, n_jobs = 7') +print(f'nnz(A*B) = {len(C.data)}', flush=True) +print(f'ntop(A*B) = {C_ntop}', flush=True) +print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) + diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 9728c467..974b4ce9 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -20,6 +20,7 @@ # distutils: language = c++ from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -45,7 +46,7 @@ cdef extern from "sparse_dot_topn_source.h": double Cx[] ); - cdef void sparse_dot_topn_extd_source( + cdef int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -59,26 +60,12 @@ cdef extern from "sparse_dot_topn_source.h": int Cp[], int Cj[], double Cx[], + vector[int]* alt_Cj, + vector[double]* alt_Cx, + int nnz_max, int* nminmax ); - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - cdef int sparse_dot_only_nnz_source( int n_row, int n_col, @@ -158,7 +145,7 @@ cpdef sparse_dot_topn_extd( np.ndarray[int, ndim=1] c_indptr, np.ndarray[int, ndim=1] c_indices, np.ndarray[double, ndim=1] c_data, - 
np.ndarray[int, ndim=1] nminmax, + np.ndarray[int, ndim=1] nminmax ): """ Cython glue function to call sparse_dot_topn_extd C++ @@ -185,6 +172,13 @@ cpdef sparse_dot_topn_extd( nminmax: The maximum number of elements per row of C (assuming ntop = n_col) + Returned output: + c_indices, c_data: CSR expression of matrix C. These will + be returned instead of output by reference + if the preset sizes of c_indices and + c_data are too small to hold all the + results. + N.B. A and B must be CSR format!!! The type of input numpy array must be aligned with types of C++ function arguments! @@ -200,12 +194,26 @@ cpdef sparse_dot_topn_extd( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + + cdef nnz_max = len(c_indices) + + cdef vector[int] vCj; + cdef vector[double] vCx; + + cdef int nnz_max_is_too_small = sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax ) - return - + + if nnz_max_is_too_small: + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data + + else: + + return None, None cpdef sparse_dot_only_nnz( int n_row, diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 8d8fadc6..2317e1ba 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -336,7 +336,7 @@ void inner_sparse_dot_topn_extd( } } -void sparse_dot_topn_extd_parallel( +int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -350,6 +350,9 @@ void sparse_dot_topn_extd_parallel( int Cp[], int Cj[], double Cx[], + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int *n_minmax, int n_jobs ) @@ -391,6 +394,23 @@ void 
sparse_dot_topn_extd_parallel( start_points[0] = 0; partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + int* Cj_container; + double* Cx_container; + + int total = start_points.back(); + int nnz_max_is_too_small = (nnz_max < total); + + if (nnz_max_is_too_small) { + alt_Cj->resize(total); + alt_Cx->resize(total); + Cj_container = &((*alt_Cj)[0]); + Cx_container = &((*alt_Cx)[0]); + } + else { + Cj_container = Cj; + Cx_container = Cx; + } + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { @@ -399,8 +419,8 @@ void sparse_dot_topn_extd_parallel( job_ranges[job_nr], Cp, start_points[job_nr], - Cj, - Cx, + Cj_container, + Cx_container, &real_candidates[job_nr], &row_nnz[job_nr] ); @@ -408,6 +428,8 @@ void sparse_dot_topn_extd_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + + return nnz_max_is_too_small; } void inner_sparse_nnz_only( diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 0099917e..3aeb11e0 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -40,7 +40,7 @@ extern void sparse_dot_topn_parallel( int n_jobs ); -extern void sparse_dot_topn_extd_parallel( +extern int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -54,6 +54,9 @@ extern void sparse_dot_topn_extd_parallel( int Cp[], int Cj[], double Cx[], + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax, int n_jobs ); diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 0cc14e62..c908bbec 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -151,14 +151,22 @@ void sparse_dot_topn_source( ntop: n top results lower_bound: a threshold that the element of A*B must greater than + nnz_max: the size of the memory allocated for the results Cj and Cx. 
If + nnz_max is found to be too small during the computation, then the + results will be placed in vectors alt_Cj and alt_Cx instead Output by reference: Cp, Cj, Cx: CSR expression of C matrix n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) + alt_Cj, alt_Cx: CSR expression of C matrix as vectors. These will + contain the output only if nnz_max is found to be too small + Returned output: + nnz_max_is_too_small: int 1 or 0 depending on whether nnz_max was found to be + too small or not respectively N.B. A and B must be CSR format!!! */ -void sparse_dot_topn_extd_source( +int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -172,6 +180,9 @@ void sparse_dot_topn_extd_source( int Cp[], int Cj[], double Cx[], //data of C + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax ) { @@ -181,6 +192,7 @@ void sparse_dot_topn_extd_source( std::vector candidates; int nnz = 0; + int nnz_max_is_too_small = 0; Cp[0] = 0; *n_minmax = 0; @@ -234,16 +246,32 @@ void sparse_dot_topn_extd_source( } else { std::sort(candidates.begin(), candidates.end(), candidate_cmp); } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; + if (len + nnz > nnz_max){ + if (!nnz_max_is_too_small){ + nnz_max_is_too_small = true; + alt_Cj->resize(nnz); + alt_Cx->resize(nnz); + std::copy(Cj, Cj + nnz, alt_Cj->data()); + std::copy(Cx, Cx + nnz, alt_Cx->data()); + } + for(int a = 0; a < len; a++){ + alt_Cj->push_back(candidates[a].index); + alt_Cx->push_back(candidates[a].value); + nnz++; + } + } + else { + for(int a = 0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } } candidates.clear(); Cp[i+1] = nnz; } + return nnz_max_is_too_small; } /* diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 7975a75b..0ac85127 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ 
b/sparse_dot_topn/sparse_dot_topn_source.h @@ -44,7 +44,7 @@ extern void sparse_dot_topn_source( double Cx[] //data of C ); -extern void sparse_dot_topn_extd_source( +extern int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -58,6 +58,9 @@ extern void sparse_dot_topn_extd_source( int Cp[], int Cj[], double Cx[], //data of C + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax ); diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index ad95fbb9..e20aaaaf 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -20,6 +20,7 @@ # distutils: language = c++ from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -47,7 +48,7 @@ cdef extern from "sparse_dot_topn_parallel.h": int n_jobs ); - cdef void sparse_dot_topn_extd_parallel( + cdef int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -61,6 +62,9 @@ cdef extern from "sparse_dot_topn_parallel.h": int Cp[], int Cj[], double Cx[], + vector[int]* alt_Cj, + vector[double]* alt_Cx, + int nnz_max, int* n_minmax, int n_jobs ); @@ -139,11 +143,26 @@ cpdef sparse_dot_topn_extd_threaded( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax, n_jobs + + cdef nnz_max = len(c_indices) + + cdef vector[int] vCj; + cdef vector[double] vCx; + + cdef int nnz_max_is_too_small = sparse_dot_topn_extd_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax, n_jobs ) - return + + if nnz_max_is_too_small: + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data + + else: + + return None, None 
cpdef sparse_dot_only_nnz_threaded( int n_row, diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index 1a631179..5560ccc3 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -11,7 +11,6 @@ PRUNE_THRESHOLD = 0.1 NUM_CANDIDATES = 3 -SCOUT_NNZ = True USE_THREADS = True MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 @@ -38,7 +37,6 @@ def get_n_top_sparse(mat, n_top=10): def helper_awesome_cossim_topn_dense( a_dense, b_dense, - scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -58,7 +56,6 @@ def helper_awesome_cossim_topn_dense( awesome_result = awesome_cossim_topn( a_csr, b_csr_t, len(b_dense), 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -67,7 +64,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -79,7 +75,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, len(b_dense), PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -88,7 +83,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -119,7 +113,6 @@ def helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, flag=True, - scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -141,7 +134,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -150,7 +142,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -162,7 +153,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -171,7 +161,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, 
- scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -216,17 +205,9 @@ def test_awesome_cossim_topn_manually(): [0.6, 0.1, 0.2, 0.8, 0.1], [0.9, 0.1, 0.6, 0.4, 0.3]] helper_awesome_cossim_topn_dense(a_dense, b_dense) - helper_awesome_cossim_topn_dense(a_dense, b_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_dense( - a_dense, - b_dense, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) # boundary checking, there is no matching at all in this case c_dense = [[0.2, 0.1, 0.3, 0, 0], @@ -240,17 +221,9 @@ def test_awesome_cossim_topn_manually(): [0, 0, 0, 0.1, 0.3], [0, 0, 0, 0.7, 0.5]] helper_awesome_cossim_topn_dense(c_dense, d_dense) - helper_awesome_cossim_topn_dense(c_dense, d_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_dense( - c_dense, - d_dense, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -264,17 +237,9 @@ def test_awesome_cossim_top_one_zeros(): a_sparse = csr_matrix(np.zeros((1, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -288,17 +253,9 @@ def 
test_awesome_cossim_top_all_zeros(): a_sparse = csr_matrix(np.zeros((2, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -311,18 +268,9 @@ def test_awesome_cossim_top_small_matrix(): a_sparse = rand(300, nr_vocab, density=density, format='csr') b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - False, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -360,15 +308,6 @@ def test_awesome_cossim_top_large_matrix(): b_sparse = b_sparse.tocsr() helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - False, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) From 0b3bc8a5cd9990a947e5d75b0b4a550bc42d74f4 Mon Sep 17 00:00:00 2001 
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 3 May 2021 11:25:42 +0200 Subject: [PATCH 16/29] tried vector reserve --- sparse_dot_topn/example/comparison3.py | 2 - sparse_dot_topn/sparse_dot_topn_parallel.cpp | 64 +++++++++++++++----- sparse_dot_topn/sparse_dot_topn_source.cpp | 1 + 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/sparse_dot_topn/example/comparison3.py b/sparse_dot_topn/example/comparison3.py index b1b9412c..c0bcf145 100644 --- a/sparse_dot_topn/example/comparison3.py +++ b/sparse_dot_topn/example/comparison3.py @@ -10,8 +10,6 @@ from scipy.sparse import load_npz from sparse_dot_topn import awesome_cossim_topn # noqa: F401 -df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) - a = load_npz('sparse_matrix_A.npz') b = load_npz('sparse_matrix_B.npz') diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 2317e1ba..522f9e72 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -51,7 +51,37 @@ void distribute_load( } } -void inner_gather_function( +void inner_gather_v2( + job_range_type job_range, + int Cp[], + int Cp_start, + int Cj[], + double Cx[], + std::vector* real_candidates, + std::vector* row_nnz +) +{ + if (job_range.begin >= job_range.end) return; + + int* nnz_begin = row_nnz->data(); + int* nnz_end = nnz_begin + row_nnz->size(); + + int* Cp_begin = &Cp[job_range.begin + 1]; + + (*row_nnz)[0] += Cp_start; + std::partial_sum(nnz_begin, nnz_end, Cp_begin); + + candidate* c_begin = real_candidates->data(); + candidate* c_end = c_begin + real_candidates->size(); + + int* Cj_begin = &Cj[Cp_start]; + double* Cx_begin = &Cx[Cp_start]; + + std::transform(c_begin, c_end, Cj_begin, [](candidate c) -> int { return c.index; }); + std::transform(c_begin, c_end, Cx_begin, [](candidate c) -> double { return c.value; }); +} + +void inner_gather_v1( job_range_type 
job_range, int Cp[], int Cp_start, @@ -217,18 +247,18 @@ void sparse_dot_topn_parallel( thread_list[job_nr].join(); // gather the results: - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector nnz_job_starts(n_jobs + 1); + nnz_job_starts[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread( - inner_gather_function, + inner_gather_v1, job_ranges[job_nr], Cp, - start_points[job_nr], + nnz_job_starts[job_nr], Cj, Cx, &real_candidates[job_nr], @@ -254,13 +284,14 @@ void inner_sparse_dot_topn_extd( std::vector* real_candidates, std::vector* row_nnz, int* total, - int* n_minmax + int* n_minmax, + int mem_sz_per_row ) { std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - real_candidates->reserve(job_range.end - job_range.begin); + real_candidates->reserve(mem_sz_per_row*(job_range.end - job_range.begin)); row_nnz->resize(job_range.end - job_range.begin); int* row_nnz_ptr = row_nnz->data(); @@ -367,6 +398,8 @@ int sparse_dot_topn_extd_parallel( std::vector sub_total(n_jobs, 0); std::vector split_n_minmax(n_jobs, 0); + int mem_sz_per_row = std::max(1, (int) ceil(((double) nnz_max)/((double) n_row))); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { @@ -380,7 +413,8 @@ int sparse_dot_topn_extd_parallel( &real_candidates[job_nr], &row_nnz[job_nr], &sub_total[job_nr], - &split_n_minmax[job_nr] + &split_n_minmax[job_nr], + mem_sz_per_row ); } @@ -390,14 +424,14 @@ int sparse_dot_topn_extd_parallel( // gather the results: *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector nnz_job_starts(n_jobs + 1); + nnz_job_starts[0] = 0; + 
partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); int* Cj_container; double* Cx_container; - int total = start_points.back(); + int total = nnz_job_starts.back(); int nnz_max_is_too_small = (nnz_max < total); if (nnz_max_is_too_small) { @@ -415,10 +449,10 @@ int sparse_dot_topn_extd_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread( - inner_gather_function, + inner_gather_v1, job_ranges[job_nr], Cp, - start_points[job_nr], + nnz_job_starts[job_nr], Cj_container, Cx_container, &real_candidates[job_nr], diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index c908bbec..be987495 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -190,6 +190,7 @@ int sparse_dot_topn_extd_source( std::vector sums(n_col, 0); std::vector candidates; + candidates.reserve(n_col); int nnz = 0; int nnz_max_is_too_small = 0; From 80d388bb1f1f08b1624737524a44c2e406f2586d Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 4 May 2021 23:03:18 +0200 Subject: [PATCH 17/29] fixed bug related to single-valued input Series --- .github/workflows/test.yml | 28 ++++++++++++++++++++++++++++ string_grouper/string_grouper.py | 18 ++++++++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..17dcc3ee --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,28 @@ +name: Run tests +on: + pull_request: + push: + branches: + - master + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.7, 3.8, 3.9] + os: [ubuntu-latest, windows-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: 
${{ matrix.python-version }} + + - name: Install package + run: pip install . + + - name: Run tests + run: python -m unittest diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 61903d5f..2be98158 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -255,7 +255,7 @@ def fit(self) -> 'StringGrouper': matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix(matches) + matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -532,11 +532,10 @@ def _get_nearest_matches(self, dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] output.index = self._duplicates.index - return output.squeeze() + return output.squeeze(axis=1) def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: - # discard self-matches: A matches A - pairs = self._matches_list[self._matches_list['master_side'] != self._matches_list['dupe_side']] + pairs = self._matches_list # rebuild graph adjacency matrix from already found matches: n = len(self._master) graph = csr_matrix( @@ -564,7 +563,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: graph.data = pairs['similarity'].to_numpy() # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ... # ... 
convert to 1D numpy array (using asarray then squeeze) and then to Series: - group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze()) + group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze(axis=1)) method = 'idxmax' # Determine the group representatives AND merge with indices: @@ -588,7 +587,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: output_id = self._master_id.iloc[group_of_master_index.group_rep].rename(id_label).reset_index(drop=True) output = pd.concat([output_id, output], axis=1) output.index = self._master.index - return output.squeeze() + return output def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, pd.Series]: master_strings = self._master @@ -617,19 +616,22 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: A = AA.tolil() r, c = A.nonzero() A[c, r] = A[r, c] + r = np.arange(A.shape[0]) + A[r, r] = 1 return A.tocsr() @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" r, c = matches.nonzero() - return pd.DataFrame({'master_side': r.astype(np.int64), + matches_list = pd.DataFrame({'master_side': r.astype(np.int64), 'dupe_side': c.astype(np.int64), 'similarity': matches.data}) + return matches_list @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: From 2c6b102d3883f93c3e45272f99e525996425bc85 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 00:49:25 +0200 Subject: [PATCH 18/29] fixed bug related to single-valued input Series --- string_grouper/string_grouper.py | 29 ++++++++++++++++------ string_grouper/test/test_string_grouper.py | 29 ++++++++++++++++++---- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git 
a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 2be98158..d0b1844c 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -251,11 +251,21 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() + # Calculate the matches using the cosine similarity matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) - if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: - # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) + + if self._duplicates is None: + # convert to lil format for best efficiency when setting matrix-elements + matches = matches.tolil() + # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by + # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results) + matches = StringGrouper._fix_diagonal(matches) + if self._max_n_matches < self._true_max_n_matches: + # the list of matches must be symmetric! 
(i.e., if A != B and A matches B; then B matches A) + matches = StringGrouper._symmetrize_matrix(matches) + matches = matches.tocsr() + # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -616,13 +626,16 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: - A = AA.tolil() - r, c = A.nonzero() - A[c, r] = A[r, c] + def _fix_diagonal(A) -> csr_matrix: r = np.arange(A.shape[0]) A[r, r] = 1 - return A.tocsr() + return A + + @staticmethod + def _symmetrize_matrix(A) -> csr_matrix: + r, c = A.nonzero() + A[c, r] = A[r, c] + return A @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index d5c1dd0b..383f4b11 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -197,7 +197,10 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + @patch( + 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix', + side_effect=mock_symmetrize_matrix + ) def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" @@ -236,17 +239,33 @@ def test_match_list_symmetry_with_symmetrize_function(self): # upper, upper_prime and their intersection should be identical. 
self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) - def test_match_list_diagonal(self): + @patch( + 'string_grouper.string_grouper.StringGrouper._fix_diagonal', + side_effect=mock_symmetrize_matrix + ) + def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal): """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() df = simple_example.customers_df['Customer Name'] matches = match_strings(df, max_n_matches=1) + mock_fix_diagonal.assert_called_once() num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) num_strings = len(df) self.assertNotEqual(num_self_joins, num_strings) + def test_match_list_diagonal(self): + """This test ensures that all self-joins are present""" + # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; + # for small datasets setting max_n_matches=1 reproduces the bug + simple_example = SimpleExample() + df = simple_example.customers_df['Customer Name'] + matches = match_strings(df, max_n_matches=1) + num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) + num_strings = len(df) + self.assertEqual(num_self_joins, num_strings) + def test_zero_min_similarity(self): """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. 
A bug related to this was first pointed out by @nbcvijanovic""" @@ -381,7 +400,7 @@ def test_get_matches_single(self): left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] left_index = [0, 0, 1, 2, 3, 3] - right_index = [3, 0, 1, 2, 3, 0] + right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, @@ -397,8 +416,8 @@ def test_get_matches_1_series_1_id_series(self): left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] left_index = [0, 0, 1, 2, 3, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'] - right_index = [3, 0, 1, 2, 3, 0] + right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] + right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, From 1b8ddecf48bb7eaecdacc3f89fd7b84eadef1321 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 01:06:23 +0200 Subject: [PATCH 19/29] modified GitHub workflow action script test.yml --- .github/workflows/test.yml | 8 ++++++-- setup.py | 3 +-- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 3 ++- sparse_dot_topn/test/test_awesome_cossim_topn.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 17dcc3ee..93336b1e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,7 +22,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install package - run: pip install . + run: | + python -m pip install --upgrade pip + pip install -e . 
- name: Run tests - run: python -m unittest + run: | + pip install pytest + pytest -ra --capture=no --showlocals diff --git a/setup.py b/setup.py index cf5d5fee..8c51b072 100644 --- a/setup.py +++ b/setup.py @@ -31,8 +31,7 @@ def finalize_options(self): array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', sources=[ - './sparse_dot_topn/array_wrappers.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' + './sparse_dot_topn/array_wrappers.pyx' ], extra_compile_args=extra_compile_args, language='c++') diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 522f9e72..0efb7a45 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -20,6 +20,7 @@ // Modified by: Particular Miner // April 14, 2021 +#include #include #include #include @@ -398,7 +399,7 @@ int sparse_dot_topn_extd_parallel( std::vector sub_total(n_jobs, 0); std::vector split_n_minmax(n_jobs, 0); - int mem_sz_per_row = std::max(1, (int) ceil(((double) nnz_max)/((double) n_row))); + int mem_sz_per_row = std::max(1, (int) std::ceil(((double) nnz_max)/((double) n_row))); std::vector thread_list(n_jobs); diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index 5560ccc3..a9734668 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -280,7 +280,7 @@ def test_awesome_cossim_top_large_matrix(): # test with large matrix nr_vocab = 2 << 24 density = 1e-6 - n_samples = 10000 + n_samples = 1000 nnz = int(n_samples * nr_vocab * density) rng1 = np.random.RandomState(42) From 75fdf3d2006cff5e8a9a8e327a1e918c17b091ce Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 11:01:40 +0200 Subject: [PATCH 20/29] renamed sparse_dot_topn sub-package to string_grouper_topn to avoid possible conflicts with original 
pypi package sparse_dot_topn --- .github/workflows/test.yml | 4 +- setup.py | 44 +++++++++---------- string_grouper/string_grouper.py | 2 +- .../__init__.py | 2 +- .../array_wrappers.pxd | 0 .../array_wrappers.pyx | 0 .../awesome_cossim_topn.py | 5 +-- .../example/comparison.py | 2 +- .../example/comparison2.py | 2 +- .../example/comparison3.py | 2 +- .../example/example.py | 2 +- .../sparse_dot_topn.pyx | 0 .../sparse_dot_topn_parallel.cpp | 0 .../sparse_dot_topn_parallel.h | 0 .../sparse_dot_topn_source.cpp | 0 .../sparse_dot_topn_source.h | 0 .../sparse_dot_topn_threaded.pyx | 0 .../test/test_awesome_cossim_topn.py | 2 +- string_grouper_utils/string_grouper_utils.py | 6 +-- 19 files changed, 36 insertions(+), 37 deletions(-) rename {sparse_dot_topn => string_grouper_topn}/__init__.py (60%) rename {sparse_dot_topn => string_grouper_topn}/array_wrappers.pxd (100%) rename {sparse_dot_topn => string_grouper_topn}/array_wrappers.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/awesome_cossim_topn.py (96%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison.py (98%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison2.py (98%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison3.py (96%) rename {sparse_dot_topn => string_grouper_topn}/example/example.py (86%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_parallel.cpp (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_parallel.h (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_source.cpp (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_source.h (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_threaded.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/test/test_awesome_cossim_topn.py (99%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 93336b1e..5317a62d 
100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,10 +21,10 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install package + - name: Install dev-package run: | python -m pip install --upgrade pip - pip install -e . + pip install -v -e . - name: Run tests run: | diff --git a/setup.py b/setup.py index 8c51b072..cf333180 100644 --- a/setup.py +++ b/setup.py @@ -29,27 +29,27 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] -array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', +array_wrappers_ext = Extension('string_grouper_topn.array_wrappers', sources=[ - './sparse_dot_topn/array_wrappers.pyx' + './string_grouper_topn/array_wrappers.pyx', ], extra_compile_args=extra_compile_args, language='c++') -original_ext = Extension('sparse_dot_topn.sparse_dot_topn', +original_ext = Extension('string_grouper_topn.sparse_dot_topn', sources=[ - './sparse_dot_topn/sparse_dot_topn.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' + './string_grouper_topn/sparse_dot_topn.pyx', + './string_grouper_topn/sparse_dot_topn_source.cpp', ], extra_compile_args=extra_compile_args, define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') -threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', +threaded_ext = Extension('string_grouper_topn.sparse_dot_topn_threaded', sources=[ - './sparse_dot_topn/sparse_dot_topn_threaded.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp', - './sparse_dot_topn/sparse_dot_topn_parallel.cpp' + './string_grouper_topn/sparse_dot_topn_threaded.pyx', + './string_grouper_topn/sparse_dot_topn_source.cpp', + './string_grouper_topn/sparse_dot_topn_parallel.cpp', ], extra_compile_args=extra_compile_args, define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], @@ -59,9 +59,9 @@ def finalize_options(self): name='string_grouper', version='0.4.0', packages=[ - 'string_grouper' - , 'string_grouper_utils' - , 
'sparse_dot_topn' + 'string_grouper_topn', + 'string_grouper', + 'string_grouper_utils', ], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' @@ -75,18 +75,18 @@ def finalize_options(self): zip_safe=False, python_requires='>3.7', setup_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0' - , 'cython>=0.29.15' - , 'numpy' - , 'scipy' + 'setuptools>=18.0', + 'cython>=0.29.15', + 'numpy', + 'scipy', ], install_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0' - , 'cython>=0.29.15' - , 'numpy' - , 'scipy' - , 'scikit-learn' - , 'pandas>=0.25.3' + 'setuptools>=18.0', + 'cython>=0.29.15', + 'numpy', + 'scipy', + 'scikit-learn', + 'pandas>=0.25.3', ], cmdclass={'build_ext': my_build_ext}, ext_modules=[array_wrappers_ext, original_ext, threaded_ext], diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d0b1844c..a2991475 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,7 +6,7 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 diff --git a/sparse_dot_topn/__init__.py b/string_grouper_topn/__init__.py similarity index 60% rename from sparse_dot_topn/__init__.py rename to string_grouper_topn/__init__.py index cbaf32a7..b123439e 100644 --- a/sparse_dot_topn/__init__.py +++ b/string_grouper_topn/__init__.py @@ -2,6 +2,6 @@ import sys if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn + from string_grouper_topn.awesome_cossim_topn import awesome_cossim_topn else: from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff 
--git a/sparse_dot_topn/array_wrappers.pxd b/string_grouper_topn/array_wrappers.pxd similarity index 100% rename from sparse_dot_topn/array_wrappers.pxd rename to string_grouper_topn/array_wrappers.pxd diff --git a/sparse_dot_topn/array_wrappers.pyx b/string_grouper_topn/array_wrappers.pyx similarity index 100% rename from sparse_dot_topn/array_wrappers.pyx rename to string_grouper_topn/array_wrappers.pyx diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py similarity index 96% rename from sparse_dot_topn/awesome_cossim_topn.py rename to string_grouper_topn/awesome_cossim_topn.py index 380c6e6e..4f90ae63 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/string_grouper_topn/awesome_cossim_topn.py @@ -2,12 +2,11 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from _ast import Continue from numpy import indices if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from string_grouper_topn import sparse_dot_topn as ct + from string_grouper_topn import sparse_dot_topn_threaded as ct_thread else: import sparse_dot_topn as ct import sparse_dot_topn_threaded as ct_thread diff --git a/sparse_dot_topn/example/comparison.py b/string_grouper_topn/example/comparison.py similarity index 98% rename from sparse_dot_topn/example/comparison.py rename to string_grouper_topn/example/comparison.py index 7ee673ca..ce3cc0ad 100644 --- a/sparse_dot_topn/example/comparison.py +++ b/string_grouper_topn/example/comparison.py @@ -6,7 +6,7 @@ import timeit import numpy as np from scipy.sparse import coo_matrix -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 N = 1000 thresh = 0.01 diff --git a/sparse_dot_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py similarity index 98% rename from 
sparse_dot_topn/example/comparison2.py rename to string_grouper_topn/example/comparison2.py index c79cb45f..5cc631f1 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import coo_matrix -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 df = pd.DataFrame(columns=['sample', '#threads', 'python']) diff --git a/sparse_dot_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py similarity index 96% rename from sparse_dot_topn/example/comparison3.py rename to string_grouper_topn/example/comparison3.py index c0bcf145..74983dde 100644 --- a/sparse_dot_topn/example/comparison3.py +++ b/string_grouper_topn/example/comparison3.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd from scipy.sparse import load_npz -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 a = load_npz('sparse_matrix_A.npz') b = load_npz('sparse_matrix_B.npz') diff --git a/sparse_dot_topn/example/example.py b/string_grouper_topn/example/example.py similarity index 86% rename from sparse_dot_topn/example/example.py rename to string_grouper_topn/example/example.py index a61951fd..a403d3ab 100644 --- a/sparse_dot_topn/example/example.py +++ b/string_grouper_topn/example/example.py @@ -1,5 +1,5 @@ from scipy.sparse import rand -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn N = 10 a = rand(100, 1000000, density=0.005, format='csr') diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/string_grouper_topn/sparse_dot_topn.pyx similarity index 100% rename from sparse_dot_topn/sparse_dot_topn.pyx rename to string_grouper_topn/sparse_dot_topn.pyx diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/string_grouper_topn/sparse_dot_topn_parallel.cpp similarity 
index 100% rename from sparse_dot_topn/sparse_dot_topn_parallel.cpp rename to string_grouper_topn/sparse_dot_topn_parallel.cpp diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/string_grouper_topn/sparse_dot_topn_parallel.h similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_parallel.h rename to string_grouper_topn/sparse_dot_topn_parallel.h diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/string_grouper_topn/sparse_dot_topn_source.cpp similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_source.cpp rename to string_grouper_topn/sparse_dot_topn_source.cpp diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/string_grouper_topn/sparse_dot_topn_source.h similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_source.h rename to string_grouper_topn/sparse_dot_topn_source.h diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/string_grouper_topn/sparse_dot_topn_threaded.pyx similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_threaded.pyx rename to string_grouper_topn/sparse_dot_topn_threaded.pyx diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py similarity index 99% rename from sparse_dot_topn/test/test_awesome_cossim_topn.py rename to string_grouper_topn/test/test_awesome_cossim_topn.py index a9734668..80a71431 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/string_grouper_topn/test/test_awesome_cossim_topn.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn from scipy.sparse.csr import csr_matrix from scipy.sparse import coo_matrix from scipy.sparse import rand diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index 11803a32..a570b377 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -1,7 +1,7 @@ 
-import numpy as np import pandas as pd from typing import List, Optional, Union from dateutil.parser import parse +from dateutil.tz import UTC from numbers import Number from datetime import datetime import re @@ -143,13 +143,13 @@ def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Ser # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): # convert strings to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(parse(x, parserinfo, **kwargs))) + return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC)) elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps): # convert pandas Timestamps to numpy datetime64 return timestamps.transform(lambda x: x.to_numpy()) elif is_series_of_type(datetime, timestamps): # convert python datetimes to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(x)) + return timestamps.transform(lambda x: x.astimezone(UTC)) elif is_series_of_type(Number, timestamps): return timestamps raise Exception(error_msg) From 29dcb4204a186e628f7269fe1329ff9149fb492b Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 14:49:33 +0200 Subject: [PATCH 21/29] added unittest for get_groups() with single-valued input Series --- string_grouper/test/test_string_grouper.py | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 383f4b11..4344177a 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -488,6 +488,43 @@ def test_get_groups_single_df_group_rep_default(self): ) ) + def test_get_groups_single_valued_series(self): + """This test ensures that get_groups() returns a single-valued DataFrame or Series object + since the input-series is also single-valued. 
This test was created in response to a bug discovered + by George Walker""" + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='group_rep'), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='most_similar_master'), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + def test_get_groups_single_df_keep_index(self): """Should return a pd.Series object with the same length as the original df. The series object will contain a list of the grouped strings with their indexes displayed in columns""" From 6f6ff50101a9383a73c07c66ba1b256f4d7edc5a Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 8 May 2021 10:11:13 +0200 Subject: [PATCH 22/29] fixed other squeeze() bugs --- string_grouper/string_grouper.py | 4 +-- string_grouper_topn/example/comparison2.py | 35 ++++++++++++++-------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index a2991475..c4cfbdef 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -277,7 +277,7 @@ def dot(self) -> pd.Series: raise Exception("To perform this function, both input Series must have the same length.") master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate pairwise cosine similarities: - pairwise_similarities = 
np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze() + pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze(axis=1) return pd.Series(pairwise_similarities, name='similarity', index=self._master.index) @validate_is_fit @@ -673,7 +673,7 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return False elif series_to_test.to_frame().applymap( lambda x: not isinstance(x, str) - ).squeeze().any(): + ).squeeze(axis=1).any(): return False return True diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py index 5cc631f1..ca4e1fff 100644 --- a/string_grouper_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -8,6 +8,8 @@ import pandas as pd from scipy.sparse import coo_matrix from string_grouper_topn import awesome_cossim_topn # noqa: F401 +from test.sortperf import flush +from _sqlite3 import Row df = pd.DataFrame(columns=['sample', '#threads', 'python']) @@ -16,7 +18,7 @@ nr_vocab = int(26**3) density = 30/nr_vocab -n_samples = 1000000 +n_samples = 10000000 n_duplicates = N nnz_a = int(n_samples * nr_vocab * density) nnz_b = int(n_duplicates * nr_vocab * density) @@ -38,22 +40,30 @@ ntop_arr = np.full(n_matrix_pairs, 0) r = 0 for it in range(n_matrix_pairs): + print('Building matrices ...', end='', flush=True) - row = rng1.randint(n_samples, size=nnz_a) - cols = rng1.randint(nr_vocab, size=nnz_a) - data = rng1.rand(nnz_a) + row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) + cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples) ]).flatten() + data = rng1.rand(len(row)) - a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a = a_sparse.tocsr() + a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a = a.tocsr() - row = rng1.randint(n_duplicates, size=nnz_b) - cols = rng1.randint(nr_vocab, size=nnz_b) - data = rng1.rand(nnz_b) + row = 
np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) + cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates) ]).flatten() + data = rng1.rand(len(row)) - b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) - b = b_sparse.T.tocsr() + b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) + b = b.T.tocsr() - C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True) + del row + del cols + del data + + print('Finished.', flush=True) + + print('Computing matrix product ...', flush=True) + C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True, use_threads=True, n_jobs=4) print(f'nnz(A*B) = {len(C.data)}', flush=True) print(f'ntop(A*B) = {C_ntop}', flush=True) print('', flush=True) @@ -61,6 +71,7 @@ ntop_arr[it] = C_ntop del C del C_ntop + print('Finished.', flush=True) # top 5 results per row From 90a6fd193ec8c4f6af730cd41c8af43befe80594 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 11 May 2021 10:00:17 +0200 Subject: [PATCH 23/29] made PEP8-conforming modifications --- string_grouper/string_grouper.py | 106 +++++++++--------- string_grouper/test/test_string_grouper.py | 26 +++-- string_grouper_topn/awesome_cossim_topn.py | 25 ++--- string_grouper_topn/example/comparison.py | 3 +- string_grouper_topn/example/comparison2.py | 56 +++++---- string_grouper_topn/example/comparison3.py | 11 +- .../test/test_awesome_cossim_topn.py | 4 +- string_grouper_utils/string_grouper_utils.py | 4 +- .../test/test_string_grouper_utils.py | 4 +- 9 files changed, 121 insertions(+), 118 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index c4cfbdef..7ebdaa82 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -4,6 +4,7 @@ import multiprocessing from sklearn.feature_extraction.text import TfidfVectorizer from scipy.sparse.csr import 
csr_matrix +from scipy.sparse.lil import lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union from string_grouper_topn import awesome_cossim_topn @@ -17,24 +18,24 @@ DEFAULT_IGNORE_CASE: bool = True # ignores case by default DEFAULT_DROP_INDEX: bool = False # includes index-columns in output DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most - # similar string index-columns with corresponding duplicates-index values -DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity - # matches appear in the output +# similar string index-columns with corresponding duplicates-index values +DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity +# matches appear in the output GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest - # similarity aggregate as group-representative: +# similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative: -DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default +DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default # The following string constants are used by (but aren't [yet] options passed to) StringGrouper DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches -DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches +DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches RIGHT_PREFIX: str = 'right_' # used to prefix 
columns on the right of the output of StringGrouper.get_matches MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of - # StringGrouper._get_nearest_matches -DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches +# StringGrouper._get_nearest_matches +DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of - # StringGrouper.get_nearest_matches +# StringGrouper.get_nearest_matches GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate # High level functions @@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple): Defaults to number of cores on a machine - 1. :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case). :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to True. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to False. :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'. The other choice is 'first'. 
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series, self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() - # _true_max_n_matches will contain the true maximum number of matches over all strings in master if - # self._config.min_similarity <= 0 + # _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: @@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() - + # Calculate the matches using the cosine similarity matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) - + if self._duplicates is None: # convert to lil format for best efficiency when setting matrix-elements - matches = matches.tolil() - # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by + matches = matches.tolil() + # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results) matches = StringGrouper._fix_diagonal(matches) if self._max_n_matches < self._true_max_n_matches: # the list of matches must be symmetric! 
(i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) matches = matches.tocsr() - + # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -283,14 +284,14 @@ def dot(self) -> pd.Series: @validate_is_fit def get_matches(self, ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool]=None) -> pd.DataFrame: + include_zeroes: Optional[bool] = None) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to self._config.ignore_index. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. 
""" def get_both_sides(master: pd.Series, @@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): else: return data.rename(f"{prefix}{data.name}") - if ignore_index is None: ignore_index = self._config.ignore_index - if include_zeroes is None: include_zeroes = self._config.include_zeroes + if ignore_index is None: + ignore_index = self._config.ignore_index + if include_zeroes is None: + include_zeroes = self._config.include_zeroes if self._config.min_similarity > 0 or not include_zeroes: matches_list = self._matches_list elif include_zeroes: # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic): - # the fix includes zero-similarity matches that are missing by default - # in _matches_list due to our use of sparse matrices + # the fix includes zero-similarity matches that are missing by default + # in _matches_list due to our use of sparse matrices non_matches_list = self._get_non_matches_list() matches_list = self._matches_list if non_matches_list.empty else \ pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) - + left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index) similarity = matches_list.similarity.reset_index(drop=True) if self._master_id is None: @@ -366,16 +369,18 @@ def get_groups(self, If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs above are returned as well altogether in a DataFrame. - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to self._config.ignore_index. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to self._config.replace_na. 
""" - if ignore_index is None: ignore_index = self._config.ignore_index + if ignore_index is None: + ignore_index = self._config.ignore_index if self._duplicates is None: return self._deduplicate(ignore_index=ignore_index) else: - if replace_na is None: replace_na = self._config.replace_na + if replace_na is None: + replace_na = self._config.replace_na return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na) @validate_is_fit @@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix """Builds the cossine similarity matrix of two csr matrices""" tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - + optional_kwargs = { 'return_best_ntop': True, 'use_threads': self._config.number_of_processes > 1, @@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame: all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) missing_pairs = all_pairs.difference(matched_pairs) - if missing_pairs.empty: return pd.DataFrame() + if missing_pairs.empty: + return pd.DataFrame() if (self._max_n_matches < self._true_max_n_matches): raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n' f'\t\t max_n_matches={self._max_n_matches} is too small!\n' @@ -483,8 +489,8 @@ def _get_nearest_matches(self, master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}' master = self._master.rename(master_label).reset_index(drop=ignore_index) dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index) - - # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging + + # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging if isinstance(dupes, pd.DataFrame): master.rename( columns={col: f'{prefix}{col}' for col in 
master.columns if str(col) != master_label}, @@ -514,14 +520,14 @@ def _get_nearest_matches(self, if self._master_id is not None: # Also update the master_id-series with the duplicates_id in cases were there is no match dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id - + # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values # appear within them. So here we change them back to their original datatypes if possible: if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \ - self._duplicates_id.dtype == self._master_id.dtype: + self._duplicates_id.dtype == self._master_id.dtype: dupes_max_sim.loc[:, master_id_label] = \ - dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) - + dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) + # Prepare the output: required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label] index_column_list = \ @@ -531,13 +537,13 @@ def _get_nearest_matches(self, # Update the master index-columns with the duplicates index-column values in cases were there is no match dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates'] dupes_max_sim.loc[rows_to_update, index_column_list] = \ - dupes_max_sim.loc[rows_to_update, dupes_index_columns].values - + dupes_max_sim.loc[rows_to_update, dupes_index_columns].values + # Restore their original datatypes if possible: for m, d in zip(index_column_list, dupes_index_columns): if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype: dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype) - + # Make sure to keep same order as duplicates dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] @@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> 
Tuple[pd.Series, master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True) dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) return master_indices, dupe_indices - + def _validate_group_rep_specs(self): group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID) if self._config.group_rep not in group_rep_options: @@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _fix_diagonal(A) -> csr_matrix: - r = np.arange(A.shape[0]) - A[r, r] = 1 - return A + def _fix_diagonal(m: lil_matrix) -> csr_matrix: + r = np.arange(m.shape[0]) + m[r, r] = 1 + return m @staticmethod - def _symmetrize_matrix(A) -> csr_matrix: - r, c = A.nonzero() - A[c, r] = A[r, c] - return A + def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix: + r, c = m_symmetric.nonzero() + m_symmetric[c, r] = m_symmetric[r, c] + return m_symmetric @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 4344177a..2438d679 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,8 +9,10 @@ compute_pairwise_similarities from unittest.mock import patch -def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: - return A + +def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix: + return x + class SimpleExample(object): def __init__(self): @@ -201,13 +203,13 @@ def test_match_strings(self, mock_StringGouper): 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix ) - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): - """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is + def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix_param): + """mocks 
StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] sg = StringGrouper(df, max_n_matches=2).fit() - mock_symmetrize_matrix.assert_called_once() + mock_symmetrize_matrix_param.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] @@ -216,7 +218,7 @@ def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_m # obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # if the intersection is empty then _matches_list is completely non-symmetric (this is acceptable) - # if the intersection is not empty then at least some matches are repeated. + # if the intersection is not empty then at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. self.assertFalse(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -234,7 +236,7 @@ def test_match_list_symmetry_with_symmetrize_function(self): # Obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # If the intersection is empty this means _matches_list is completely non-symmetric (this is acceptable) - # If the intersection is not empty this means at least some matches are repeated. + # If the intersection is not empty this means at least some matches are repeated. 
# To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -267,7 +269,7 @@ def test_match_list_diagonal(self): self.assertEqual(num_self_joins, num_strings) def test_zero_min_similarity(self): - """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are + """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -276,7 +278,7 @@ def test_zero_min_similarity(self): pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) def test_zero_min_similarity_small_max_n_matches(self): - """This test ensures that a warning is issued when n_max_matches is suspected to be too small while + """This test ensures that a warning is issued when n_max_matches is suspected to be too small while min_similarity <= 0 and include_zeroes is True""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -665,9 +667,9 @@ def test_get_groups_4_df_same_similarity(self): test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, + sg = StringGrouper(test_series_1, + test_series_2, + master_id=test_series_id_1, duplicates_id=test_series_id_2, ignore_index=True) sg = sg.fit() diff --git a/string_grouper_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py index 4f90ae63..65be44de 100644 --- a/string_grouper_topn/awesome_cossim_topn.py +++ b/string_grouper_topn/awesome_cossim_topn.py 
@@ -2,7 +2,6 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from numpy import indices if sys.version_info[0] >= 3: from string_grouper_topn import sparse_dot_topn as ct @@ -27,13 +26,13 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - return_best_ntop: (default: False) if True, will return best_ntop together + return_best_ntop: (default: False) if True, will return best_ntop together with C as a tuple: (C, best_ntop) Output: C: result matrix (returned alone, if return_best_ntop=False) - best_ntop: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_ntop). It is + best_ntop: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_ntop). It is returned only if return_best_ntop=True. N.B. if A and B are not in CSR format, they will be converted to CSR @@ -42,10 +41,11 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: try: ind_arr = np.empty(sz, dtype=idx_dtype) dat_arr = np.empty(sz, dtype=data_dtype) + del ind_arr, dat_arr return True except MemoryError: return False - + if not isspmatrix_csr(A): A = A.tocsr() if not isspmatrix_csr(B): @@ -74,22 +74,22 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: return output indptr = np.empty(M + 1, dtype=idx_dtype) - + # reduce nnz_max if too large to fit in available memory: while (not try_malloc(nnz_max, idx_dtype, A.dtype)): nnz_max = nnz_max//2 # take a chance on high matrix-sparsity and reduce further: nnz_max = max(M, nnz_max//16) - + # filled matrices from here on indices = np.empty(nnz_max, dtype=idx_dtype) data = np.empty(nnz_max, dtype=A.dtype) - + best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - + if not use_threads: - + alt_indices, alt_data = ct.sparse_dot_topn_extd( M, N, np.asarray(A.indptr, 
dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), @@ -118,15 +118,14 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: lower_bound, indptr, indices, data, best_ntop_arr, n_jobs ) - + if alt_indices is not None: indices = alt_indices data = alt_data - + # prepare and return the output: output = csr_matrix((data, indices, indptr), shape=(M, N)) if return_best_ntop: return output, best_ntop_arr[0] else: return output - diff --git a/string_grouper_topn/example/comparison.py b/string_grouper_topn/example/comparison.py index ce3cc0ad..d2d41efc 100644 --- a/string_grouper_topn/example/comparison.py +++ b/string_grouper_topn/example/comparison.py @@ -6,7 +6,6 @@ import timeit import numpy as np from scipy.sparse import coo_matrix -from string_grouper_topn import awesome_cossim_topn # noqa: F401 N = 1000 thresh = 0.01 @@ -122,7 +121,7 @@ def get_csr_ntop_idx_data(csr_row, ntop): return sorted(result, key=lambda x: -x[1]) -def scipy_cossim_top(A, B, ntop, lower_bound=0): +def scipy_cossim_top(A, B, ntop): C = A.dot(B) return [get_csr_ntop_idx_data(row, ntop) for row in C] diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py index ca4e1fff..75e99461 100644 --- a/string_grouper_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -8,8 +8,6 @@ import pandas as pd from scipy.sparse import coo_matrix from string_grouper_topn import awesome_cossim_topn # noqa: F401 -from test.sortperf import flush -from _sqlite3 import Row df = pd.DataFrame(columns=['sample', '#threads', 'python']) @@ -41,25 +39,25 @@ r = 0 for it in range(n_matrix_pairs): print('Building matrices ...', end='', flush=True) - + row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) - cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples) ]).flatten() + cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples)]).flatten() data = 
rng1.rand(len(row)) - + a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) a = a.tocsr() - + row = np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) - cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates) ]).flatten() + cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates)]).flatten() data = rng1.rand(len(row)) - + b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) b = b.T.tocsr() - + del row del cols del data - + print('Finished.', flush=True) print('Computing matrix product ...', flush=True) @@ -72,11 +70,11 @@ del C del C_ntop print('Finished.', flush=True) - + # top 5 results per row - + print("Non-parallelized sparse_dot_topn function") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', number=3, globals=globals()) @@ -84,9 +82,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 1 thread") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', number=3, globals=globals()) @@ -94,9 +92,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 2 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', number=3, globals=globals()) @@ -104,9 +102,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 3 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', number=3, globals=globals()) @@ -114,9 +112,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 4 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', number=3, globals=globals()) @@ -124,9 +122,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + 
print("Threaded function with 5 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', number=3, globals=globals()) @@ -134,9 +132,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 6 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', number=3, globals=globals()) @@ -144,9 +142,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 7 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', number=3, globals=globals()) @@ -154,7 +152,7 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print('') print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') @@ -162,7 +160,7 @@ df = df.astype({ 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() - + print(results) print('') print('') diff --git a/string_grouper_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py index 74983dde..de0984f4 100644 --- a/string_grouper_topn/example/comparison3.py +++ b/string_grouper_topn/example/comparison3.py @@ -3,10 +3,10 @@ """ from __future__ import print_function -import timeit +# import timeit import time import numpy as np -import pandas as pd +# import pandas as pd from scipy.sparse import load_npz from string_grouper_topn import awesome_cossim_topn # noqa: F401 @@ -24,8 +24,8 @@ thresh = 0.8 nr_vocab = b.shape[0] -density_A = len(a.data)/(a.shape[0]*a.shape[1]) -density_B = len(b.data)/(b.shape[0]*b.shape[1]) +density_A = len(a.data)/(a.shape[0]*a.shape[1]) +density_B = len(b.data)/(b.shape[0]*b.shape[1]) n_samples = a.shape[0] n_duplicates = b.shape[1] nnz_a = len(a.data) @@ -49,11 +49,10 @@ it = 0 tic 
= time.perf_counter() -C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs = 7, return_best_ntop=True) +C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs=7, return_best_ntop=True) toc = time.perf_counter() print('scout_nnz=True, use_threads=True, n_jobs = 7') print(f'nnz(A*B) = {len(C.data)}', flush=True) print(f'ntop(A*B) = {C_ntop}', flush=True) print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) - diff --git a/string_grouper_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py index 80a71431..ffb17915 100644 --- a/string_grouper_topn/test/test_awesome_cossim_topn.py +++ b/string_grouper_topn/test/test_awesome_cossim_topn.py @@ -39,7 +39,7 @@ def helper_awesome_cossim_topn_dense( b_dense, use_threads=False, n_jobs=1 - ): + ): dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product sparse_result = csr_matrix(dense_result) sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) @@ -115,7 +115,7 @@ def helper_awesome_cossim_topn_sparse( flag=True, use_threads=False, n_jobs=1 - ): + ): # Note: helper function using awesome_cossim_topn sparse_result = a_sparse.dot(b_sparse.T) # dot product sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index a570b377..e674367b 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -137,8 +137,8 @@ def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame): def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series: - error_msg = f"timestamps must be a Series of date-like or datetime-like strings" - error_msg += f" or datetime datatype or pandas Timestamp datatype or numbers" + error_msg = "timestamps must be a Series of date-like or datetime-like strings" + error_msg += " or datetime datatype or pandas Timestamp 
datatype or numbers" if is_series_of_type(str, timestamps): # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): diff --git a/string_grouper_utils/test/test_string_grouper_utils.py b/string_grouper_utils/test/test_string_grouper_utils.py index 3798e3cd..0c8a8ee4 100644 --- a/string_grouper_utils/test/test_string_grouper_utils.py +++ b/string_grouper_utils/test/test_string_grouper_utils.py @@ -1,8 +1,8 @@ import unittest import pandas as pd from dateutil.parser import parse -from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \ - new_group_rep_by_highest_weight +from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \ + new_group_rep_by_completeness, new_group_rep_by_highest_weight class SimpleExample(object): From 32d7136db5467beebc2d0468032b29f248ac2e46 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 15:39:56 +0200 Subject: [PATCH 24/29] removed string_grouper_topn submodule --- setup.py | 79 +-- string_grouper_topn/__init__.py | 7 - string_grouper_topn/array_wrappers.pxd | 16 - string_grouper_topn/array_wrappers.pyx | 73 --- string_grouper_topn/awesome_cossim_topn.py | 132 ---- string_grouper_topn/example/comparison.py | 136 ----- string_grouper_topn/example/comparison2.py | 166 ----- string_grouper_topn/example/comparison3.py | 58 -- string_grouper_topn/example/example.py | 14 - string_grouper_topn/sparse_dot_topn.pyx | 261 -------- .../sparse_dot_topn_parallel.cpp | 571 ------------------ .../sparse_dot_topn_parallel.h | 78 --- .../sparse_dot_topn_source.cpp | 446 -------------- string_grouper_topn/sparse_dot_topn_source.h | 80 --- .../sparse_dot_topn_threaded.pyx | 190 ------ .../test/test_awesome_cossim_topn.py | 313 ---------- 16 files changed, 8 insertions(+), 2612 deletions(-) delete mode 100644 
string_grouper_topn/__init__.py delete mode 100644 string_grouper_topn/array_wrappers.pxd delete mode 100644 string_grouper_topn/array_wrappers.pyx delete mode 100644 string_grouper_topn/awesome_cossim_topn.py delete mode 100644 string_grouper_topn/example/comparison.py delete mode 100644 string_grouper_topn/example/comparison2.py delete mode 100644 string_grouper_topn/example/comparison3.py delete mode 100644 string_grouper_topn/example/example.py delete mode 100644 string_grouper_topn/sparse_dot_topn.pyx delete mode 100644 string_grouper_topn/sparse_dot_topn_parallel.cpp delete mode 100644 string_grouper_topn/sparse_dot_topn_parallel.h delete mode 100644 string_grouper_topn/sparse_dot_topn_source.cpp delete mode 100644 string_grouper_topn/sparse_dot_topn_source.h delete mode 100644 string_grouper_topn/sparse_dot_topn_threaded.pyx delete mode 100644 string_grouper_topn/test/test_awesome_cossim_topn.py diff --git a/setup.py b/setup.py index cf333180..f4b5ecb0 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ -from setuptools import setup, Extension +from setuptools import setup import pathlib -import os # The directory containing this file HERE = pathlib.Path(__file__).parent @@ -8,65 +7,13 @@ # The text of the README file README = (HERE / "README.md").read_text() -# workaround for numpy and Cython install dependency -# the solution is from https://stackoverflow.com/a/54138355 -def my_build_ext(pars): - # import delayed: - from setuptools.command.build_ext import build_ext as _build_ext - class build_ext(_build_ext): - def finalize_options(self): - _build_ext.finalize_options(self) - # Prevent numpy from thinking it is still in its setup process: - __builtins__.__NUMPY_SETUP__ = False - import numpy - self.include_dirs.append(numpy.get_include()) - - #object returned: - return build_ext(pars) - -if os.name == 'nt': - extra_compile_args = ["-Ox"] -else: - extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] - -array_wrappers_ext = 
Extension('string_grouper_topn.array_wrappers', - sources=[ - './string_grouper_topn/array_wrappers.pyx', - ], - extra_compile_args=extra_compile_args, - language='c++') - -original_ext = Extension('string_grouper_topn.sparse_dot_topn', - sources=[ - './string_grouper_topn/sparse_dot_topn.pyx', - './string_grouper_topn/sparse_dot_topn_source.cpp', - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - -threaded_ext = Extension('string_grouper_topn.sparse_dot_topn_threaded', - sources=[ - './string_grouper_topn/sparse_dot_topn_threaded.pyx', - './string_grouper_topn/sparse_dot_topn_source.cpp', - './string_grouper_topn/sparse_dot_topn_parallel.cpp', - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - setup( name='string_grouper', version='0.4.0', - packages=[ - 'string_grouper_topn', - 'string_grouper', - 'string_grouper_utils', - ], + packages=['string_grouper'], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', - keywords='cosine-similarity sparse-matrix sparse-graph scipy cython', author='Chris van den Berg', long_description=README, long_description_content_type="text/markdown", @@ -74,20 +21,10 @@ def finalize_options(self): url='https://github.com/Bergvca/string_grouper', zip_safe=False, python_requires='>3.7', - setup_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0', - 'cython>=0.29.15', - 'numpy', - 'scipy', - ], - install_requires=[# Setuptools 18.0 properly handles Cython extensions. 
- 'setuptools>=18.0', - 'cython>=0.29.15', - 'numpy', - 'scipy', - 'scikit-learn', - 'pandas>=0.25.3', - ], - cmdclass={'build_ext': my_build_ext}, - ext_modules=[array_wrappers_ext, original_ext, threaded_ext], + install_requires=['pandas>=0.25.3' + , 'scipy' + , 'scikit-learn' + , 'numpy' + , 'sparse_dot_topn>=0.2.6' + ] ) diff --git a/string_grouper_topn/__init__.py b/string_grouper_topn/__init__.py deleted file mode 100644 index b123439e..00000000 --- a/string_grouper_topn/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# flake8: noqa -import sys - -if sys.version_info[0] >= 3: - from string_grouper_topn.awesome_cossim_topn import awesome_cossim_topn -else: - from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff --git a/string_grouper_topn/array_wrappers.pxd b/string_grouper_topn/array_wrappers.pxd deleted file mode 100644 index 3af1a3c4..00000000 --- a/string_grouper_topn/array_wrappers.pxd +++ /dev/null @@ -1,16 +0,0 @@ -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] diff --git a/string_grouper_topn/array_wrappers.pyx b/string_grouper_topn/array_wrappers.pyx deleted file mode 100644 index 18525766..00000000 --- a/string_grouper_topn/array_wrappers.pyx +++ /dev/null @@ -1,73 +0,0 @@ -from cpython cimport Py_buffer -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be 
destroyed automatically. - - def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 diff --git a/string_grouper_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py deleted file mode 100644 index 09ee7917..00000000 --- a/string_grouper_topn/awesome_cossim_topn.py +++ /dev/null @@ -1,132 +0,0 @@ -import sys -import numpy as np -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr - -if sys.version_info[0] >= 3: - from string_grouper_topn import sparse_dot_topn as ct - from string_grouper_topn import sparse_dot_topn_threaded as ct_thread -else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_topn( - A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, return_best_ntop=False): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. 
- If return_best_ntop=True then best_ntop - (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_ntop). - - Input: - A and B: two CSR matrices - ntop: top n results - lower_bound: a threshold that the element of A*B must be greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - return_best_ntop: (default: False) if True, will return best_ntop together - with C as a tuple: (C, best_ntop) - - Output: - C: result matrix (returned alone, if return_best_ntop=False) - best_ntop: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_ntop). It is - returned only if return_best_ntop=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: - try: - ind_arr = np.empty(sz, dtype=idx_dtype) - dat_arr = np.empty(sz, dtype=data_dtype) - del ind_arr, dat_arr - return True - except MemoryError: - return False - - if not isspmatrix_csr(A): - A = A.tocsr() - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_ntop: - return output, 0 - else: - return output - - indptr = np.empty(M + 1, dtype=idx_dtype) - - # reduce nnz_max if too large to fit in available memory: - nnz_max = 16*nnz_max - while (not try_malloc(nnz_max, idx_dtype, A.dtype)): - nnz_max = nnz_max//2 - - # take a chance on high matrix-sparsity and reduce further: - nnz_max = max(M, nnz_max//16) - - # filled matrices from here on - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - alt_indices, alt_data = ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_ntop_arr - ) - - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - alt_indices, alt_data = ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_ntop_arr, n_jobs - ) - - if alt_indices is not None: - indices = alt_indices - data = alt_data - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_ntop: - return output, best_ntop_arr[0] - else: - return output diff --git a/string_grouper_topn/example/comparison.py b/string_grouper_topn/example/comparison.py deleted file mode 100644 index d2d41efc..00000000 --- a/string_grouper_topn/example/comparison.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -import timeit -import numpy as np -from scipy.sparse import coo_matrix - -N = 1000 -thresh = 0.01 - -nr_vocab = 2 << 24 -density = 1e-6 -n_samples = 1000000 -n_duplicates = 1000000 -nnz_a = int(n_samples * nr_vocab * density) -nnz_b = int(n_duplicates * nr_vocab * density) - - -print(f'density = {density}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_a = {nnz_a}', flush=True) -print(f'nnz_b = {nnz_b}', flush=True) -print('\n', flush=True) - -rng1 = np.random.RandomState(42) -rng2 = np.random.RandomState(43) - -row = rng1.randint(n_samples, size=nnz_a) -cols = rng2.randint(nr_vocab, size=nnz_a) -data = rng1.rand(nnz_a) - -a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) -a = a_sparse.tocsr() - -row = rng1.randint(n_duplicates, size=nnz_b) -cols = rng2.randint(nr_vocab, size=nnz_b) -data = rng1.rand(nnz_b) - -b_sparse = coo_matrix((data, (row, cols)), 
shape=(n_duplicates, nr_vocab)) -b = b_sparse.T.tocsr() - - -# top 5 results per row - -print("Original sparse_dot_topn function") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 1 thread") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 2 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 3 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 4 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 5 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 6 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 7 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) -print(rtv) - -# use scipy and numpy function - - -def get_csr_ntop_idx_data(csr_row, ntop): - """ - Get list (row index, score) of the n top matches - """ - nnz = csr_row.getnnz() - if nnz == 0: - return None - elif nnz <= ntop: - result = zip(csr_row.indices, csr_row.data) - else: - arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] - result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) - - return sorted(result, key=lambda x: -x[1]) - - -def scipy_cossim_top(A, B, ntop): - C = A.dot(B) - return [get_csr_ntop_idx_data(row, ntop) for row in C] - -# top 5 results per row which element is greater than 2 - - 
-print("Scipy+numpy original function") - -rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py deleted file mode 100644 index 557eedd2..00000000 --- a/string_grouper_topn/example/comparison2.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -import timeit -import numpy as np -import pandas as pd -from scipy.sparse import coo_matrix -from string_grouper_topn import awesome_cossim_topn # noqa: F401 - -df = pd.DataFrame(columns=['sample', '#threads', 'python']) - -N = 4000 -thresh = 0.01 - -nr_vocab = int(26**3) -density = 30/nr_vocab -n_samples = 1000000 -n_duplicates = N -nnz_a = int(n_samples * nr_vocab * density) -nnz_b = int(n_duplicates * nr_vocab * density) - -print(f'ntop = {N}', flush=True) -print(f'threshold = {thresh}', flush=True) -print(f'density = {density}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_A = {nnz_a}', flush=True) -print(f'nnz_B = {nnz_b}', flush=True) -print('', flush=True) - -rng1 = np.random.RandomState(42) - -n_matrix_pairs = 2**4 -nnz_arr = np.full(n_matrix_pairs, 0) -ntop_arr = np.full(n_matrix_pairs, 0) -r = 0 -for it in range(n_matrix_pairs): - print('Building matrices ...', end='', flush=True) - - row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) - cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples)]).flatten() - data = rng1.rand(len(row)) - - a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a = a.tocsr() - - row = np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) - cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in 
range(n_duplicates)]).flatten() - data = rng1.rand(len(row)) - - b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) - b = b.T.tocsr() - - del row - del cols - del data - - print('Finished.', flush=True) - - print('Computing matrix product ...', flush=True) - C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True, use_threads=True, n_jobs=4) - print(f'nnz(A*B) = {len(C.data)}', flush=True) - print(f'ntop(A*B) = {C_ntop}', flush=True) - print('', flush=True) - nnz_arr[it] = len(C.data) - ntop_arr[it] = C_ntop - del C - del C_ntop - print('Finished.', flush=True) - - # top 5 results per row - - print("Non-parallelized sparse_dot_topn function") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) - df.loc[r] = [it, 0, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 1 thread") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) - df.loc[r] = [it, 1, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 2 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) - df.loc[r] = [it, 2, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 3 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) - df.loc[r] = [it, 3, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 4 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) - df.loc[r] = [it, 4, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - 
print("Threaded function with 5 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) - df.loc[r] = [it, 5, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 6 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) - df.loc[r] = [it, 6, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 7 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) - df.loc[r] = [it, 7, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print('') - print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') - print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') - print('') - df = df.astype({ - 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) - results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() - - print(results) - print('') - print('') diff --git a/string_grouper_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py deleted file mode 100644 index de0984f4..00000000 --- a/string_grouper_topn/example/comparison3.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -# import timeit -import time -import numpy as np -# import pandas as pd -from scipy.sparse import load_npz -from string_grouper_topn import awesome_cossim_topn # noqa: F401 - -a = load_npz('sparse_matrix_A.npz') -b = load_npz('sparse_matrix_B.npz') - -# tic = time.perf_counter() -# p = np.random.permutation(a.shape[0]) -# a = a[p] -# toc = time.perf_counter() -# print(f'shuffle(A) took {(toc - tic):0.4f} seconds', 
flush=True) - - -N = b.shape[1] -thresh = 0.8 - -nr_vocab = b.shape[0] -density_A = len(a.data)/(a.shape[0]*a.shape[1]) -density_B = len(b.data)/(b.shape[0]*b.shape[1]) -n_samples = a.shape[0] -n_duplicates = b.shape[1] -nnz_a = len(a.data) -nnz_b = len(b.data) - -print(f'ntop = {N}', flush=True) -print(f'threshold = {thresh}', flush=True) -print(f'density(A) = {density_A}', flush=True) -print(f'density(B) = {density_B}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_A = {nnz_a}', flush=True) -print(f'nnz_B = {nnz_b}', flush=True) -print('', flush=True) - -n_matrix_pairs = 1 -nnz_arr = np.full(n_matrix_pairs, 0) -ntop_arr = np.full(n_matrix_pairs, 0) -r = 0 -it = 0 - -tic = time.perf_counter() -C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs=7, return_best_ntop=True) -toc = time.perf_counter() - -print('scout_nnz=True, use_threads=True, n_jobs = 7') -print(f'nnz(A*B) = {len(C.data)}', flush=True) -print(f'ntop(A*B) = {C_ntop}', flush=True) -print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) diff --git a/string_grouper_topn/example/example.py b/string_grouper_topn/example/example.py deleted file mode 100644 index a403d3ab..00000000 --- a/string_grouper_topn/example/example.py +++ /dev/null @@ -1,14 +0,0 @@ -from scipy.sparse import rand -from string_grouper_topn import awesome_cossim_topn - -N = 10 -a = rand(100, 1000000, density=0.005, format='csr') -b = rand(1000000, 200, density=0.005, format='csr') - -# Use standard implementation - -c = awesome_cossim_topn(a, b, 5, 0.01) - -# Use parallel implementation with 4 threads - -d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4) diff --git a/string_grouper_topn/sparse_dot_topn.pyx b/string_grouper_topn/sparse_dot_topn.pyx deleted file mode 100644 index 974b4ce9..00000000 --- a/string_grouper_topn/sparse_dot_topn.pyx +++ /dev/null @@ -1,261 +0,0 @@ -# 
-# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Zhe Sun, Ahmet Erdem -# April 20, 2017 -# Modified by: Particular Miner -# April 14, 2021 - -# distutils: language = c++ - -from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double - -cimport numpy as np -import numpy as np - -np.import_array() - - -cdef extern from "sparse_dot_topn_source.h": - - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - vector[int]* alt_Cj, - vector[double]* alt_Cx, - int nnz_max, - int* nminmax - ); - - cdef int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound - ); - -cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] 
a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data -): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx - ) - return - -cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax -): - """ - Cython glue function to call sparse_dot_topn_extd C++ - implementation. 
This function will return a matrix C in CSR - format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - Returned output: - c_indices, c_data: CSR expression of matrix C. These will - be returned instead of output by reference - if the preset sizes of c_indices and - c_data are too small to hold all the - results. - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - cdef nnz_max = len(c_indices) - - cdef vector[int] vCj; - cdef vector[double] vCx; - - cdef int nnz_max_is_too_small = sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax - ) - - if nnz_max_is_too_small: - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data - - else: - - return None, None - -cpdef sparse_dot_only_nnz( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound -): - """ - Cython glue function to call sparse_dot_nnz_only C++ implementation - This function will return nnz, the total number of nonzero - matrix-components of - C = [top n results > lower_bound for each row of A * B]. - - Input: - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Returned output: - nnz: the total number of nonzero matrix-components of C - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - - return sparse_dot_only_nnz_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound - ) diff --git a/string_grouper_topn/sparse_dot_topn_parallel.cpp b/string_grouper_topn/sparse_dot_topn_parallel.cpp deleted file mode 100644 index 0efb7a45..00000000 --- a/string_grouper_topn/sparse_dot_topn_parallel.cpp +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#include -#include -#include -#include -#include - -#include "./sparse_dot_topn_source.h" -#include "./sparse_dot_topn_parallel.h" - - -struct job_range_type {int begin; int end;}; - -void distribute_load( - int load_sz, - int n_jobs, - std::vector &ranges -) -{ - // share the load among jobs: - int equal_job_load_sz = load_sz/n_jobs; - int rem = load_sz % n_jobs; - ranges.resize(n_jobs); - - int start = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - ranges[job_nr].begin = start; - ranges[job_nr].end = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); - start = ranges[job_nr].end; - } -} - -void inner_gather_v2( - job_range_type job_range, - int Cp[], - int Cp_start, - int Cj[], - double Cx[], - std::vector* real_candidates, - std::vector* row_nnz -) -{ - if (job_range.begin >= job_range.end) return; - - int* nnz_begin = row_nnz->data(); - int* nnz_end = nnz_begin + row_nnz->size(); - - int* Cp_begin = &Cp[job_range.begin + 1]; - - (*row_nnz)[0] += Cp_start; - std::partial_sum(nnz_begin, nnz_end, Cp_begin); - - candidate* c_begin = real_candidates->data(); - candidate* c_end = c_begin + real_candidates->size(); - - int* Cj_begin = &Cj[Cp_start]; - double* Cx_begin = &Cx[Cp_start]; - - std::transform(c_begin, c_end, Cj_begin, [](candidate c) -> int { return c.index; }); - std::transform(c_begin, c_end, Cx_begin, [](candidate c) -> double { return c.value; }); -} - -void inner_gather_v1( - job_range_type job_range, - int Cp[], - int Cp_start, - int vCj_start[], - double vCx_start[], - std::vector* real_candidates, - std::vector* row_nnz -) -{ - candidate* c = real_candidates->data(); - int* vCj_cursor = &vCj_start[Cp_start]; - double* vCx_cursor = &vCx_start[Cp_start]; - - int Cp_i = Cp_start; - int* row_nnz_ptr = row_nnz->data(); - - for (int i = job_range.begin; i < job_range.end; i++){ - for (int j = 0; j < (*row_nnz_ptr); j++){ - 
*(vCj_cursor++) = c->index; - *(vCx_cursor++) = (c++)->value; - } - Cp_i += *(row_nnz_ptr++); - Cp[i + 1] = Cp_i; - } -} - -void inner_sparse_dot_topn( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_nnz, - int* total -) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(job_range.end - job_range.begin); - - row_nnz->resize(job_range.end - job_range.begin); - int* row_nnz_ptr = row_nnz->data(); - - for (int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + 
ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_nnz_ptr++) = len; - (*total) += len; - } -} - -void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregate: - std::vector sub_total(n_jobs, 0); - - std::vector thread_list(n_jobs); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn, - job_ranges[job_nr], - n_col, ntop, - lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results: - std::vector nnz_job_starts(n_jobs + 1); - nnz_job_starts[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_v1, - job_ranges[job_nr], - Cp, - nnz_job_starts[job_nr], - Cj, - Cx, - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); -} - -void inner_sparse_dot_topn_extd( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_nnz, - int* total, - int* n_minmax, - int mem_sz_per_row 
-) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(mem_sz_per_row*(job_range.end - job_range.begin)); - - row_nnz->resize(job_range.end - job_range.begin); - int* row_nnz_ptr = row_nnz->data(); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_nnz_ptr++) = len; - (*total) += len; - } -} - -int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int *n_minmax, - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregates: - std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); - - int mem_sz_per_row = std::max(1, (int) std::ceil(((double) nnz_max)/((double) n_row))); - - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn_extd, - job_ranges[job_nr], - n_col, ntop, - lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr], - &split_n_minmax[job_nr], - mem_sz_per_row - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results: - *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - - std::vector nnz_job_starts(n_jobs + 1); - nnz_job_starts[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); - - int* Cj_container; - double* Cx_container; - - int total = nnz_job_starts.back(); - int nnz_max_is_too_small = (nnz_max < total); - - if 
(nnz_max_is_too_small) { - alt_Cj->resize(total); - alt_Cx->resize(total); - Cj_container = &((*alt_Cj)[0]); - Cx_container = &((*alt_Cx)[0]); - } - else { - Cj_container = Cj; - Cx_container = Cx; - } - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_v1, - job_ranges[job_nr], - Cp, - nnz_job_starts[job_nr], - Cj_container, - Cx_container, - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - return nnz_max_is_too_small; -} - -void inner_sparse_nnz_only( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - int* nnz -) -{ - - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - int candidates_sz = 0; - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i + 1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j + 1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner) candidates_sz++; - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - if (candidates_sz > ntop_inner) candidates_sz = ntop_inner; - - (*nnz) += 
candidates_sz; - } -} - -int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs -) -{ - std::vector job_row_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_row_ranges); - - std::vector split_nnz(n_jobs, 0); - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread ( - inner_sparse_nnz_only, - job_row_ranges[job_nr], - n_col, - ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &split_nnz[job_nr] - ); - - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - return std::accumulate(split_nnz.begin(), split_nnz.end(), (int) 0); -} - diff --git a/string_grouper_topn/sparse_dot_topn_parallel.h b/string_grouper_topn/sparse_dot_topn_parallel.h deleted file mode 100644 index 3aeb11e0..00000000 --- a/string_grouper_topn/sparse_dot_topn_parallel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#ifndef UTILS_CPPCLASS_H -#define UTILS_CPPCLASS_H - -extern void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs -); - -extern int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax, - int n_jobs -); - -extern int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs -); - -#endif //UTILS_CPPCLASS_H diff --git a/string_grouper_topn/sparse_dot_topn_source.cpp b/string_grouper_topn/sparse_dot_topn_source.cpp deleted file mode 100644 index be987495..00000000 --- a/string_grouper_topn/sparse_dot_topn_source.cpp +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#include -#include -#include - -#include "./sparse_dot_topn_source.h" - -bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } - -/* - C++ implementation of sparse_dot_topn - - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - - N.B. A and B must be CSR format!!! -*/ -void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = 
sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } -} - -/* - C++ implementation of sparse_dot_topn_extd_source - - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - nnz_max: the size of the memory allocated for the results Cj and Cx. If - nnz_max is found to be too small during the computation, then the - results will be placed in vectors alt_Cj and alt_Cx instead - - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) - alt_Cj, alt_Cx: CSR expression of C matrix as vectors. These will - contain the output only if nnz_max is found to be too small - - Returned output: - nnz_max_is_too_small: int 1 or 0 depending on whether nnz_max was found to be - too small or not respectively - N.B. A and B must be CSR format!!! 
-*/ -int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - candidates.reserve(n_col); - - int nnz = 0; - int nnz_max_is_too_small = 0; - - Cp[0] = 0; - *n_minmax = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - if (len + nnz > nnz_max){ - if (!nnz_max_is_too_small){ - nnz_max_is_too_small = true; - alt_Cj->resize(nnz); - alt_Cx->resize(nnz); - std::copy(Cj, Cj + nnz, alt_Cj->data()); - std::copy(Cx, Cx + nnz, alt_Cx->data()); - } - for(int a = 0; a < len; a++){ - alt_Cj->push_back(candidates[a].index); - alt_Cx->push_back(candidates[a].value); - nnz++; - } - } - else { - for(int a = 0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - } - candidates.clear(); - - Cp[i+1] = nnz; - } - return nnz_max_is_too_small; -} - -/* - C++ implementation of sparse_dot_nnz_source - - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C - - N.B. A and B must be CSR format!!! 
-*/ -void sparse_dot_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - *nnz = 0; - *ntop = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - int nnz_k = 0; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - *ntop = (nnz_k > *ntop)? nnz_k : *ntop; - *nnz += nnz_k; - } -} - -/* - C++ implementation of sparse_dot_only_max_nnz_col_source - - This function will return nnz, the total number of nonzero - matrix-components of - C = [top n results > lower_bound for each row of A * B]. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: top n results - lower_bound: a threshold that the element of A*B must greater than - - Returned output: - nnz: the total number of nonzero matrix-components of C - - N.B. A and B must be CSR format!!! 
-*/ -int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - int nnz = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - int candidates_sz = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) candidates_sz++; - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - if (candidates_sz > ntop) candidates_sz = ntop; - - nnz += candidates_sz; - } - return nnz; -} diff --git a/string_grouper_topn/sparse_dot_topn_source.h b/string_grouper_topn/sparse_dot_topn_source.h deleted file mode 100644 index 0ac85127..00000000 --- a/string_grouper_topn/sparse_dot_topn_source.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#ifndef UTILS_CPPCLASS_H -#define UTILS_CPPCLASS_H - - -struct candidate {int index; double value;}; - -extern bool candidate_cmp(candidate c_i, candidate c_j); - -extern void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] //data of C -); - -extern int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax -); - -extern int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound -); - -#endif //UTILS_CPPCLASS_H diff --git a/string_grouper_topn/sparse_dot_topn_threaded.pyx b/string_grouper_topn/sparse_dot_topn_threaded.pyx deleted file mode 100644 index e20aaaaf..00000000 --- a/string_grouper_topn/sparse_dot_topn_threaded.pyx +++ /dev/null @@ -1,190 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Zhe Sun, Ahmet Erdem -# April 20, 2017 -# Modified by: Particular Miner -# April 14, 2021 - -# distutils: language = c++ - -from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double - -cimport numpy as np -import numpy as np - - -np.import_array() - - -cdef extern from "sparse_dot_topn_parallel.h": - - cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); - - cdef int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - vector[int]* alt_Cj, - vector[double]* alt_Cx, - int nnz_max, - int* n_minmax, - int n_jobs - ); - - cdef int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs - ); - -cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - 
np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_jobs - ) - return - -cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - cdef nnz_max = len(c_indices) - - cdef vector[int] vCj; - cdef vector[double] vCx; - - cdef int nnz_max_is_too_small = sparse_dot_topn_extd_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax, n_jobs - ) - - if nnz_max_is_too_small: - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data - - else: - - return None, None - -cpdef sparse_dot_only_nnz_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, 
ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - - return sparse_dot_only_nnz_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, n_jobs - ) diff --git a/string_grouper_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py deleted file mode 100644 index ffb17915..00000000 --- a/string_grouper_topn/test/test_awesome_cossim_topn.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- - -from string_grouper_topn import awesome_cossim_topn -from scipy.sparse.csr import csr_matrix -from scipy.sparse import coo_matrix -from scipy.sparse import rand -import numpy as np -import pandas as pd -import multiprocessing -import pytest - -PRUNE_THRESHOLD = 0.1 -NUM_CANDIDATES = 3 -USE_THREADS = True -MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 - - -def get_n_top_sparse(mat, n_top=10): - """ - Get list of (index, value) of the n largest elements in a 1-dimensional sparse matrix - - :param mat: input sparse matrix - :param n_top: number of largest elements, default is 10. 
- :return: sorted list of largest elements - """ - length = mat.getnnz() - if length == 0: - return None - if length <= n_top: - result = list(zip(mat.indices, mat.data)) - else: - arg_idx = np.argpartition(mat.data, -n_top)[-n_top:] - result = list(zip(mat.indices[arg_idx], mat.data[arg_idx])) - return sorted(result, key=lambda x: -x[1]) - - -def helper_awesome_cossim_topn_dense( - a_dense, - b_dense, - use_threads=False, - n_jobs=1 - ): - dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product - sparse_result = csr_matrix(dense_result) - sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) - for row in sparse_result] # get ntop using the old method - - pruned_dense_result = dense_result.copy() - pruned_dense_result[pruned_dense_result < PRUNE_THRESHOLD] = 0 # prune low similarity - pruned_sparse_result = csr_matrix(pruned_dense_result) - pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] - - a_csr = csr_matrix(a_dense) - b_csr_t = csr_matrix(b_dense).T - - awesome_result = awesome_cossim_topn( - a_csr, b_csr_t, len(b_dense), - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed - - pruned_awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - len(b_dense), - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in pruned_awesome_result_top3] - - # no candidate selection, no pruning - assert awesome_result.nnz == sparse_result.nnz 
- # no candidate selection, below PRUNE_THRESHOLD similarity pruned - assert pruned_awesome_result.nnz == pruned_sparse_result.nnz - - all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) - all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) - - # top NUM_CANDIDATES candidates selected, no pruning - if not all_none1: - np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) - else: - assert len(awesome_result_top3) == len(sparse_result_top3) - # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned - if not all_none2: - np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) - else: - assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) - - -def helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - flag=True, - use_threads=False, - n_jobs=1 - ): - # Note: helper function using awesome_cossim_topn - sparse_result = a_sparse.dot(b_sparse.T) # dot product - sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) - for row in sparse_result] # get ntop using the old method - - pruned_sparse_result = sparse_result.copy() - pruned_sparse_result[pruned_sparse_result < PRUNE_THRESHOLD] = 0 # prune low similarity - pruned_sparse_result.eliminate_zeros() - pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] - - a_csr = csr_matrix(a_sparse) - b_csr_t = csr_matrix(b_sparse).T - - awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - b_sparse.shape[0], - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed - - 
pruned_awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - b_sparse.shape[0], - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in pruned_awesome_result_top3] - - # no candidate selection, no pruning - assert awesome_result.nnz == sparse_result.nnz - # no candidate selection, below PRUNE_THRESHOLD similarity pruned - assert pruned_awesome_result.nnz == pruned_sparse_result.nnz - - if flag: - all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) - all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) - - # top NUM_CANDIDATES candidates selected, no pruning - if not all_none1: - np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) - else: - assert len(awesome_result_top3) == len(sparse_result_top3) - # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned - if not all_none2: - np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) - else: - assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) - else: - assert awesome_result_top3 == sparse_result_top3 - assert pruned_awesome_result_top3 == pruned_sparse_result_top3 - - -def test_awesome_cossim_topn_manually(): - # a simple case - a_dense = [[0.2, 0.1, 0.0, 0.9, 0.3], - [0.7, 0.0, 0.0, 0.2, 0.2], - [0.0, 0.0, 0.0, 0.2, 0.1], - [0.5, 0.4, 0.5, 0.0, 0.0]] - - b_dense = [[0.4, 0.2, 0.3, 0.2, 0.7], - [0.9, 0.4, 0.5, 0.1, 0.4], - [0.3, 0.8, 0.0, 0.2, 0.5], - [0.3, 0.0, 0.1, 0.1, 0.6], - [0.6, 0.1, 0.2, 0.8, 0.1], - [0.9, 0.1, 0.6, 0.4, 0.3]] - helper_awesome_cossim_topn_dense(a_dense, b_dense) - for process in range(MAX_N_PROCESSES): - n_jobs = 
process + 1 - helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - - # boundary checking, there is no matching at all in this case - c_dense = [[0.2, 0.1, 0.3, 0, 0], - [0.7, 0.2, 0.7, 0, 0], - [0.3, 0.9, 0.6, 0, 0], - [0.5, 0.4, 0.5, 0, 0]] - d_dense = [[0, 0, 0, 0.6, 0.9], - [0, 0, 0, 0.1, 0.1], - [0, 0, 0, 0.2, 0.6], - [0, 0, 0, 0.8, 0.4], - [0, 0, 0, 0.1, 0.3], - [0, 0, 0, 0.7, 0.5]] - helper_awesome_cossim_topn_dense(c_dense, d_dense) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_one_zeros(): - # test with one row matrix with all zeros - # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top - nr_vocab = 1000 - density = 0.1 - for _ in range(3): - a_sparse = csr_matrix(np.zeros((1, nr_vocab))) - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_all_zeros(): - # test with all zeros matrix - # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top - nr_vocab = 1000 - density = 0.1 - for _ in range(3): - a_sparse = csr_matrix(np.zeros((2, nr_vocab))) - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - for process in 
range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_small_matrix(): - # test with small matrix - nr_vocab = 1000 - density = 0.1 - for _ in range(10): - a_sparse = rand(300, nr_vocab, density=density, format='csr') - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_large_matrix(): - # MB: I reduced the size of the matrix so the test also runs in small memory. 
- # test with large matrix - nr_vocab = 2 << 24 - density = 1e-6 - n_samples = 1000 - nnz = int(n_samples * nr_vocab * density) - - rng1 = np.random.RandomState(42) - rng2 = np.random.RandomState(43) - - for _ in range(1): - # scipy.sparse.rand has very high memory usage - # see for details: https://github.com/scipy/scipy/issues/9699 - # a_sparse = rand(500, nr_vocab, density=density, format='csr') - # b_sparse = rand(80000, nr_vocab, density=density, format='csr') - - # switching to alternative random method below, which is also a lot faster - row = rng1.randint(500, size=nnz) - cols = rng2.randint(nr_vocab, size=nnz) - data = rng1.rand(nnz) - - a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a_sparse = a_sparse.tocsr() - - row = rng1.randint(n_samples, size=nnz) - cols = rng2.randint(nr_vocab, size=nnz) - data = rng1.rand(nnz) - - b_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - b_sparse = b_sparse.tocsr() - - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) From bce1ce7548176f191b02a55f7bdb81256e051810 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 16:01:45 +0200 Subject: [PATCH 25/29] restored dependency on upgraded package sparse_dot_topn --- string_grouper/string_grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 7ebdaa82..7ef70c94 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -7,7 +7,7 @@ from scipy.sparse.lil import lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from string_grouper_topn import awesome_cossim_topn +from sparse_dot_topn 
import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 From 3fd7329df9bddc0826491712450518f7251e67cd Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 16:10:10 +0200 Subject: [PATCH 26/29] updated GitHub workflow action test script --- .github/workflows/test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5317a62d..d69ee3c5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,11 +21,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install dev-package - run: | - python -m pip install --upgrade pip - pip install -v -e . - - name: Run tests run: | pip install pytest From 7742fe4730e314c494e347934754e7fefdb8241c Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 22:09:13 +0200 Subject: [PATCH 27/29] updated dependency on latest version of sparse_dot_topn (v 0.3.1) --- .github/workflows/test.yml | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d69ee3c5..5317a62d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,6 +21,11 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Install dev-package + run: | + python -m pip install --upgrade pip + pip install -v -e . 
+ - name: Run tests run: | pip install pytest diff --git a/setup.py b/setup.py index f4b5ecb0..4b7dc00a 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,6 @@ , 'scipy' , 'scikit-learn' , 'numpy' - , 'sparse_dot_topn>=0.2.6' + , 'sparse_dot_topn>=0.3.1' ] ) From 36f731635ec17dea0dc2435d13151779b13fa606 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Fri, 11 Jun 2021 08:28:46 +0200 Subject: [PATCH 28/29] updated CHANGELOG.md --- .github/workflows/test.yml | 4 +--- CHANGELOG.md | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5317a62d..b29917e6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,4 @@ jobs: pip install -v -e . - name: Run tests - run: | - pip install pytest - pytest -ra --capture=no --showlocals + run: python -m unittest diff --git a/CHANGELOG.md b/CHANGELOG.md index d1cb63ff..9ec3c325 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.1?] - 2021-06-11 + +### Added +[No additions were made] + +### Changed + +* Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1 +* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if +`duplicates` is not given). +* Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. 
+ +### Removed + +* Removed the keyword argument `suppress_warning` + ## [0.4.0] - 2021-04-11 ### Added From 64e4f8597fe3d508e90a4fcecba0228bf8b224f3 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Fri, 11 Jun 2021 13:40:44 +0200 Subject: [PATCH 29/29] added new keyword argument tfidf_matrix_dtype (the datatype for the tf-idf values of the matrix components). Allowed values are np.float32 and np.float64 (used by sparse_dot_topn v0.3.1). Default is np.float32: np.float32 often leads to faster processing but less precision than np.float64 --- CHANGELOG.md | 7 ++++--- README.md | 1 + string_grouper/string_grouper.py | 16 +++++++++++++++- string_grouper/test/test_string_grouper.py | 7 +++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ec3c325..ed25a1c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.4.1?] - 2021-06-11 ### Added -[No additions were made] + +* Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `numpy.float32` and `numpy.float64` (used by the required external package `sparse_dot_topn` version 0.3.1). Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) ### Changed * Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1 -* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if -`duplicates` is not given). +* Changed the default datatype for cosine similarities from numpy.float64 to numpy.float32 to boost computational performance at the expense of numerical precision. 
+* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given). * Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. ### Removed diff --git a/README.md b/README.md index 6d391ead..1b18c3c9 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used: * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. + * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. 
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 7ef70c94..d1612511 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -11,6 +11,7 @@ from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 +DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) DEFAULT_REGEX: str = r'[,-./]|\s' DEFAULT_MAX_N_MATCHES: int = 20 DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match @@ -140,6 +141,10 @@ class StringGrouperConfig(NamedTuple): Class with configuration variables. :param ngram_size: int. The amount of characters in each n-gram. Default is 3. + :param tfidf_matrix_dtype: type. The datatype for the tf-idf values of the matrix components. + Possible values allowed by sparse_dot_topn are np.float32 and np.float64. Default is np.float32. + (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision + than np.float64.) :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. 
@@ -157,6 +162,7 @@ class StringGrouperConfig(NamedTuple): """ ngram_size: int = DEFAULT_NGRAM_SIZE + tfidf_matrix_dtype: int = DEFAULT_TFIDF_MATRIX_DTYPE regex: str = DEFAULT_REGEX max_n_matches: Optional[int] = None min_similarity: float = DEFAULT_MIN_SIMILARITY @@ -227,9 +233,10 @@ def __init__(self, master: pd.Series, self._max_n_matches = self._config.max_n_matches self._validate_group_rep_specs() + self._validate_tfidf_matrix_dtype() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not - self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) + self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype) # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() # _true_max_n_matches will contain the true maximum number of matches over all strings in master if @@ -622,6 +629,13 @@ def _validate_group_rep_specs(self): f"Invalid option value for group_rep. The only permitted values are\n {group_rep_options}" ) + def _validate_tfidf_matrix_dtype(self): + dtype_options = (np.float32, np.float64) + if self._config.tfidf_matrix_dtype not in dtype_options: + raise Exception( + f"Invalid option value for tfidf_matrix_dtype. 
The only permitted values are\n {dtype_options}" + ) + def _validate_replace_na_and_drop(self): if self._config.ignore_index and self._config.replace_na: raise Exception("replace_na can only be set to True when ignore_index=False.") diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 2438d679..f5f0aac8 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -133,6 +133,7 @@ def test_compute_pairwise_similarities(self): ], name='similarity' ) + expected_result = expected_result.astype(np.float32) pd.testing.assert_series_equal(expected_result, similarities) def test_compute_pairwise_similarities_data_integrity(self): @@ -367,6 +368,7 @@ def test_build_matches_list(self): dupe_side = [0, 1] similarity = [1.0, 1.0] expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_case_insensitive_build_matches_list(self): @@ -379,6 +381,7 @@ def test_case_insensitive_build_matches_list(self): dupe_side = [0, 1] similarity = [1.0, 1.0] expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_get_matches_two_dataframes(self): @@ -393,6 +396,7 @@ def test_get_matches_two_dataframes(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def 
test_get_matches_single(self): @@ -407,6 +411,7 @@ def test_get_matches_single(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_1_series_1_id_series(self): @@ -424,6 +429,7 @@ def test_get_matches_1_series_1_id_series(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_2_series_2_id_series(self): @@ -443,6 +449,7 @@ def test_get_matches_2_series_2_id_series(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_raises_exception_if_unexpected_options_given(self):