From eb5cdd9008011417acd3d30aae3aecf65fd68878 Mon Sep 17 00:00:00 2001 From: bergvca Date: Sun, 11 Apr 2021 21:33:26 +0200 Subject: [PATCH 01/29] Added changelog with all changes since version 0.3.2 --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..d1cb63ff --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,31 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.4.0] - 2021-04-11 + +### Added + +* Added group representative functionality - by default the centroid is used. From [@ParticularMiner](https://github.com/ParticularMiner) +* Added string_grouper_utils package with additional group-representative functionality: + * new_group_rep_by_earliest_timestamp + * new_group_rep_by_completeness + * new_group_rep_by_highest_weight + + From [@ParticularMiner](https://github.com/ParticularMiner) +* Original indices are now added by default to output of `group_similar_strings`, `match_most_similar` and `match_strings`. + From [@ParticularMiner](https://github.com/ParticularMiner) +* `compute_pairwise_similarities` function From [@ParticularMiner](https://github.com/ParticularMiner) + +### Changed + +* Default group representative is now the centroid. Used to be the first string in the series belonging to a group. + From [@ParticularMiner](https://github.com/ParticularMiner) +* Output of `match_most_similar` and `match_strings` is now a `pandas.DataFrame` object instead of a `pandas.Series` +by default. From [@ParticularMiner](https://github.com/ParticularMiner) +* Fixed a bug which occurs when min_similarity=0. 
From [@ParticularMiner](https://github.com/ParticularMiner) \ No newline at end of file From 5cb0066367228bba8c1f308c726d93e0728214ed Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 12 Apr 2021 21:34:34 +0200 Subject: [PATCH 02/29] added StringGrouper attribute function _get_true_max_n_matches() and removed kwarg suppress_warning. An error is now raised when max_n_matches is too small. --- README.md | 3 +- string_grouper/string_grouper.py | 62 ++++++++++++++-------- string_grouper/test/test_string_grouper.py | 6 +-- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index d1579a48..7faa5239 100644 --- a/README.md +++ b/README.md @@ -142,8 +142,7 @@ All functions are built using a class **StringGrouper**. This class `number of cores on a machine - 1.` * **ignore_index**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) * **replace_na**: For function match_most_similar, determines whether `NaN` values in index-columns are replaced or not by index-labels from duplicates. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md) for a demonstration.) **Warning:** Make sure the kwarg `max_n_matches` is sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise some zero-similarity-matches returned will be false. - * **suppress_warning**: when min_similarity ≤ 0 and include_zeroes is `True`, determines whether or not to suppress the message warning that max_n_matches may be too small. 
Defaults to `False`. + * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md).) **Note:** If include_zeroes is `True` and the kwarg max_n_matches is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and string_grouper suggests an alternative value for max_n_matches. To allow string_grouper to automatically use the appropriate value for max_n_matches then do not set this kwarg at all. * **group_rep**: For function group_similar_strings, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. ## Examples diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 8485f5c2..21ead95d 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -8,7 +8,6 @@ from typing import Tuple, NamedTuple, List, Optional, Union from sparse_dot_topn import awesome_cossim_topn from functools import wraps -import warnings DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -21,9 +20,6 @@ # similar string index-columns with corresponding duplicates-index values DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity # matches appear in the output -DEFAULT_SUPPRESS_WARNING: bool = False # when the minimum cosine similarity <=0 and zero-similarity matches are - # requested, determines whether or not to suppress the message warning that - # max_n_matches may be too small GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest # similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as 
group-representative: @@ -153,8 +149,6 @@ class StringGrouperConfig(NamedTuple): :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False. :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to True. - :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to supress - the message warning that max_n_matches may be too small. Defaults to False. :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to False. :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'. @@ -163,13 +157,12 @@ class StringGrouperConfig(NamedTuple): ngram_size: int = DEFAULT_NGRAM_SIZE regex: str = DEFAULT_REGEX - max_n_matches: int = DEFAULT_MAX_N_MATCHES + max_n_matches: Optional[int] = None min_similarity: float = DEFAULT_MIN_SIMILARITY number_of_processes: int = DEFAULT_N_PROCESSES ignore_case: bool = DEFAULT_IGNORE_CASE ignore_index: bool = DEFAULT_DROP_INDEX include_zeroes: bool = DEFAULT_INCLUDE_ZEROES - suppress_warning: bool = DEFAULT_SUPPRESS_WARNING replace_na: bool = DEFAULT_REPLACE_NA group_rep: str = DEFAULT_GROUP_REP @@ -226,12 +219,17 @@ def __init__(self, master: pd.Series, self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) + self._max_n_matches = DEFAULT_MAX_N_MATCHES if self._config.max_n_matches is None \ + else self._config.max_n_matches self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) - # After the StringGrouper is build, _matches_list will contain 
the indices and similarities of two matches + # After the StringGrouper is built, _matches_list will contain the indices and similarities of two matches + # and _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._matches_list: pd.DataFrame = pd.DataFrame() + self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: """ @@ -271,8 +269,7 @@ def dot(self) -> pd.Series: @validate_is_fit def get_matches(self, ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool]=None, - suppress_warning: Optional[bool]=None) -> pd.DataFrame: + include_zeroes: Optional[bool]=None) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows @@ -281,8 +278,6 @@ def get_matches(self, self._config.ignore_index. :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. - :param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress - the message warning that max_n_matches may be too small. Defaults to self._config.suppress_warning. 
""" def get_both_sides(master: pd.Series, duplicates: pd.Series, @@ -306,14 +301,13 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): if ignore_index is None: ignore_index = self._config.ignore_index if include_zeroes is None: include_zeroes = self._config.include_zeroes - if suppress_warning is None: suppress_warning = self._config.suppress_warning if self._config.min_similarity > 0 or not include_zeroes: matches_list = self._matches_list elif include_zeroes: # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic): # the fix includes zero-similarity matches that are missing by default # in _matches_list due to our use of sparse matrices - non_matches_list = self._get_non_matches_list(suppress_warning) + non_matches_list = self._get_non_matches_list() matches_list = self._matches_list if non_matches_list.empty else \ pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) @@ -437,6 +431,12 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix """Builds the cossine similarity matrix of two csr matrices""" tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() + + # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: + if self._config.min_similarity <= 0: + self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) + if self._config.max_n_matches is None: + self._max_n_matches = self._true_max_n_matches optional_kwargs = dict() if self._config.number_of_processes > 1: @@ -446,7 +446,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix } return awesome_cossim_topn(tf_idf_matrix_1, tf_idf_matrix_2, - self._config.max_n_matches, + self._max_n_matches, self._config.min_similarity, **optional_kwargs) @@ -462,23 +462,39 @@ def _symmetrize_matches_list(self): ).set_index(['master_side', 'dupe_side']) ).reset_index() - def 
_get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame: + def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates) all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) missing_pairs = all_pairs.difference(matched_pairs) if missing_pairs.empty: return pd.DataFrame() - if (self._config.max_n_matches < d_sz) and not suppress_warning: - warnings.warn(f'WARNING: max_n_matches={self._config.max_n_matches} may be too small!\n' - f'\t\t Some zero-similarity matches returned may be false!\n' - f'\t\t To be absolutely certain all zero-similarity matches are true,\n' - f'\t\t try setting max_n_matches={d_sz} (the length of the Series parameter duplicates).\n' - f'\t\t To suppress this warning, set suppress_warning=True.') + if (self._max_n_matches < self._true_max_n_matches): + raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n' + f'\t\t max_n_matches={self._max_n_matches} is too small!\n' + f'\t\t Try setting max_n_matches={self._true_max_n_matches} (the \n' + f'\t\t true maximum number of matches over all strings in master)\n' + f'\t\t or greater or do not set this kwarg at all.') missing_pairs = missing_pairs.to_frame(index=False) missing_pairs['similarity'] = 0 return missing_pairs + @staticmethod + def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: + """Returns the true maximum number of matches over all strings in master""" + def get_n_matches(i: int) -> int: + a_cols = A.indices[A.indptr[i]:A.indptr[i+1]] + nz = np.full(N, 0, dtype=int) + for j in a_cols: + nz[B.indices[B.indptr[j]:B.indptr[j+1]]] = 1 + return np.sum(nz) + + A, B = AA.tocsr(), BB.tocsr() + M, _ = A.shape + _, N = B.shape + v = 
np.vectorize(get_n_matches) + return np.amax(v(range(M))) + @staticmethod def _get_matches_list(matches) -> pd.DataFrame: """Returns a list of all the indices of matches""" diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 723d3f22..cbc4bcae 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,7 +9,6 @@ match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities from unittest.mock import patch -import warnings class SimpleExample(object): @@ -93,7 +92,7 @@ def test_config_defaults(self): """Empty initialisation should set default values""" config = StringGrouperConfig() self.assertEqual(config.min_similarity, DEFAULT_MIN_SIMILARITY) - self.assertEqual(config.max_n_matches, DEFAULT_MAX_N_MATCHES) + self.assertEqual(config.max_n_matches, None) self.assertEqual(config.regex, DEFAULT_REGEX) self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE) self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES) @@ -253,7 +252,7 @@ def test_zero_min_similarity(self): simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.whatever_series_1 - matches = match_strings(s_master, s_dup, max_n_matches=len(s_master), min_similarity=0) + matches = match_strings(s_master, s_dup, min_similarity=0) pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) def test_zero_min_similarity_small_max_n_matches(self): @@ -262,7 +261,6 @@ def test_zero_min_similarity_small_max_n_matches(self): simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.two_strings - warnings.simplefilter('error', UserWarning) with self.assertRaises(Exception): _ = match_strings(s_master, s_dup, max_n_matches=1, min_similarity=0) From f9f1868c15588ae43a84e2d1bb7289fad24c4eb9 Mon Sep 17 00:00:00 2001 From: Particular Miner 
<78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 12:55:57 +0200 Subject: [PATCH 03/29] modified ing-bank's sparse_dot_topn to get n_max_matches true value and made significant performance enhancements: 1. _symmetrize_matrix instead of _symmetrize_matches_list (boost x5) 2. _get_matches_list (boost x33) --- sparse_dot_topn/.gitignore | 4 + sparse_dot_topn/__init__.py | 7 + sparse_dot_topn/awesome_cossim_minmax_topn.py | 92 +++++ sparse_dot_topn/awesome_cossim_topn.py | 89 +++++ sparse_dot_topn/sparse_dot_topn.pyx | 160 ++++++++ sparse_dot_topn/sparse_dot_topn_parallel.cpp | 366 ++++++++++++++++++ sparse_dot_topn/sparse_dot_topn_parallel.h | 55 +++ sparse_dot_topn/sparse_dot_topn_source.cpp | 243 ++++++++++++ sparse_dot_topn/sparse_dot_topn_source.h | 57 +++ sparse_dot_topn/sparse_dot_topn_threaded.pyx | 120 ++++++ string_grouper/string_grouper.py | 65 ++-- string_grouper/test/test_string_grouper.py | 11 +- 12 files changed, 1228 insertions(+), 41 deletions(-) create mode 100644 sparse_dot_topn/.gitignore create mode 100644 sparse_dot_topn/__init__.py create mode 100644 sparse_dot_topn/awesome_cossim_minmax_topn.py create mode 100644 sparse_dot_topn/awesome_cossim_topn.py create mode 100644 sparse_dot_topn/sparse_dot_topn.pyx create mode 100644 sparse_dot_topn/sparse_dot_topn_parallel.cpp create mode 100644 sparse_dot_topn/sparse_dot_topn_parallel.h create mode 100644 sparse_dot_topn/sparse_dot_topn_source.cpp create mode 100644 sparse_dot_topn/sparse_dot_topn_source.h create mode 100644 sparse_dot_topn/sparse_dot_topn_threaded.pyx diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore new file mode 100644 index 00000000..d40e00a1 --- /dev/null +++ b/sparse_dot_topn/.gitignore @@ -0,0 +1,4 @@ +/sparse_dot_topn.cp39-win_amd64.pyd +/sparse_dot_topn_threaded.cp39-win_amd64.pyd +/sparse_dot_topn.cpp +/sparse_dot_topn_threaded.cpp diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py new file mode 100644 index 
00000000..09d2bfa7 --- /dev/null +++ b/sparse_dot_topn/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa +import sys + +if sys.version_info[0] >= 3: + from sparse_dot_topn.awesome_cossim_minmax_topn import awesome_cossim_minmax_topn +else: + from awesome_cossim_minmax_topn import awesome_cossim_minmax_topn \ No newline at end of file diff --git a/sparse_dot_topn/awesome_cossim_minmax_topn.py b/sparse_dot_topn/awesome_cossim_minmax_topn.py new file mode 100644 index 00000000..92fdf87f --- /dev/null +++ b/sparse_dot_topn/awesome_cossim_minmax_topn.py @@ -0,0 +1,92 @@ +import sys +import numpy as np +from scipy.sparse import csr_matrix +from scipy.sparse import isspmatrix_csr + +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +# else: + # import sparse_dot_topn as ct + # import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_minmax_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. 
if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return 0, csr_matrix((data, indices, indptr), shape=(M, N)) + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_minmax_topn( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' + raise ValueError(err_str) + + ct_thread.sparse_dot_minmax_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, minmax_topn, n_jobs) + + return minmax_topn[0], csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py new file mode 100644 index 00000000..c4af03d4 --- /dev/null +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -0,0 +1,89 @@ +import sys +import numpy as np +from scipy.sparse import csr_matrix +from scipy.sparse import isspmatrix_csr + +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +else: + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_topn(A, B, ntop, lower_bound=0, 
use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return csr_matrix((data, indices, indptr), shape=(M, N)) + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + if not use_threads: + + ct.sparse_dot_topn( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, n_jobs) + + return csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx new file mode 100644 index 00000000..1da3181a --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -0,0 +1,160 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at# +# http://www.apache.org/licenses/LICENSE-2.0# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Zhe Sun, Ahmet Erdem +# April 20, 2017 + +# distutils: language = c++ + +import numpy as np +cimport numpy as np + +cdef extern from "sparse_dot_topn_source.h": + + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); + + cdef void sparse_dot_minmax_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int minmax_topn[]); + +cpdef sparse_dot_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function aguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return + +cpdef sparse_dot_minmax_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_minmax_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + It also returns minmax_ntop (the maximum number of columns set + for each row of A * B when ntop is infinite) + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + minmax_ntop: the maximum number of columns set for each row of + A * B when ntop is infinite + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function aguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + return \ No newline at end of file diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp new file mode 100644 index 00000000..dc123b80 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#include +#include +#include +#include + +#include "./sparse_dot_topn_source.h" +#include "./sparse_dot_topn_parallel.h" + +void inner_sparse_function(int start_row, int end_row, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], + double Bx_copy[], std::vector real_candidates[]) +{ + +std::vector next(n_col_inner,-1); +std::vector sums(n_col_inner, 0); + +std::vector temp_candidates; + +int iterations_count = 0; + +for(int i = start_row; i < end_row; i++){ + + iterations_count += 1; + + int head = -2; + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + + + 
temp_candidates.resize(len); + real_candidates[i] = temp_candidates; + + temp_candidates.clear(); + +} + +} + +void sparse_dot_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs) +{ + + Cp[0] = 0; + + int split_amount = n_row / n_jobs; + + std::vector> split_row_vector(n_jobs); + + std::vector> real_candidates(n_row); + + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + + std::vector thread_list(n_jobs); + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + int start_split = job_nr * split_amount; + int end_split = start_split + split_amount; + + if (job_nr == n_jobs -1) { + end_split = n_row; + } + + temp_vector[0] = start_split; + temp_vector[1] = end_split; + + split_row_vector[job_nr] = temp_vector; + + } + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + + thread_list[job_nr] = std::thread (inner_sparse_function, start_row, + end_row, n_col, ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + thread_list[job_nr].join(); + } + + int nnz = 0; + + for (int m = 0; m < n_row; m++) { + + std::vector cand = real_cand_pointer[m]; + + int can_len = (int)cand.size(); + + for(int can_nr=0; can_nr < can_len; can_nr++){ + Cj[nnz] = cand[can_nr].index; + Cx[nnz] = cand[can_nr].value; + nnz++; + } + + Cp[m+1] = nnz; + + } + +} + +void inner_sparse_minmax_function(int start_row, int end_row, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], + double Bx_copy[], std::vector real_candidates[], + int *minmax_ntop) +{ + +std::vector next(n_col_inner,-1); +std::vector 
sums(n_col_inner, 0); + +std::vector temp_candidates; + +int iterations_count = 0; + +for(int i = start_row; i < end_row; i++){ + + iterations_count += 1; + + int head = -2; + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + + + temp_candidates.resize(len); + real_candidates[i] = temp_candidates; + + temp_candidates.clear(); + +} + +} + +void sparse_dot_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_ntop, + int n_jobs) +{ + + Cp[0] = 0; + + int split_amount = n_row / n_jobs; + + 
std::vector> split_row_vector(n_jobs); + + std::vector> real_candidates(n_row); + + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + + std::vector split_minmax_ntop(n_jobs, 0); + + std::vector thread_list(n_jobs); + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + int start_split = job_nr * split_amount; + int end_split = start_split + split_amount; + + if (job_nr == n_jobs -1) { + end_split = n_row; + } + + temp_vector[0] = start_split; + temp_vector[1] = end_split; + + split_row_vector[job_nr] = temp_vector; + + } + + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + + thread_list[job_nr] = std::thread (inner_sparse_minmax_function, start_row, + end_row, n_col, ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &split_minmax_ntop[job_nr]); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + thread_list[job_nr].join(); + } + + int nnz = 0; + + for (int m = 0; m < n_row; m++) { + + std::vector cand = real_cand_pointer[m]; + + int can_len = (int)cand.size(); + + for(int can_nr=0; can_nr < can_len; can_nr++){ + Cj[nnz] = cand[can_nr].index; + Cx[nnz] = cand[can_nr].value; + nnz++; + } + + Cp[m+1] = nnz; + + } + *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + +} diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h new file mode 100644 index 00000000..bd70b573 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#ifndef UTILS_CPPCLASS_H +#define UTILS_CPPCLASS_H + +extern void sparse_dot_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs); + +extern void sparse_dot_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* minmax_topn, + int n_jobs); + +#endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp new file mode 100644 index 00000000..e5cc3e12 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#include +#include +#include + +#include "./sparse_dot_topn_source.h" + +bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } + +/* + C++ implementation of sparse_dot_topn + + This function will return a matrxi C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + + N.B. A and B must be CSR format!!! 
+*/ +void sparse_dot_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } +} + +/* + C++ implementation of sparse_dot_minmax_topn + + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. 
+ It also returns minmax_ntop (the maximum number of columns set + for each row of A * B when ntop is infinite) + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + minmax_ntop: the maximum number of columns set for each row of + A * B when ntop is infinite + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_ntop) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + *minmax_ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } +} diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h new file mode 100644 index 00000000..d51de107 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: Zhe Sun, Ahmet Erdem +// April 20, 2017 + +#ifndef UTILS_CPPCLASS_H +#define UTILS_CPPCLASS_H + +struct candidate {int index; double value;}; + +extern bool candidate_cmp(candidate c_i, candidate c_j); + +extern void sparse_dot_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); + +extern void sparse_dot_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *minmax_topn); + +#endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx new file mode 100644 index 00000000..1cef2229 --- /dev/null +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -0,0 +1,120 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at# +# http://www.apache.org/licenses/LICENSE-2.0# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Author: Zhe Sun, Ahmet Erdem +# April 20, 2017 + +# distutils: language = c++ + +import numpy as np +cimport numpy as np + +cdef extern from "sparse_dot_topn_parallel.h": + + cdef void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs); + + cdef void sparse_dot_minmax_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int minmax_ntop[], + int n_jobs); + +cpdef sparse_dot_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_jobs) + return + +cpdef sparse_dot_minmax_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] minmax_ntop, + int n_jobs + ): + + 
cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* o_minmax_ntop = &minmax_ntop[0] + + sparse_dot_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 21ead95d..73689cf5 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,8 +6,9 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn +from sparse_dot_topn import awesome_cossim_minmax_topn from functools import wraps +import time DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -247,13 +248,15 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() + # Calculate the matches using the cosine similarity - matches = self._build_matches(master_matrix, duplicate_matrix) - # retrieve all matches - self._matches_list = self._get_matches_list(matches) + self._true_max_n_matches, matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: # the list of matches needs to be symmetric!!! 
(i.e., if A != B and A matches B; then B matches A) - self._symmetrize_matches_list() + matches = StringGrouper._symmetrize_matrix(matches) + + # build list from matrix + self._matches_list = self._get_matches_list(matches) self.is_build = True return self @@ -434,7 +437,10 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: if self._config.min_similarity <= 0: + tic = time.perf_counter() self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) + toc = time.perf_counter() + print(f"1. _true_max_n_matches = {self._true_max_n_matches}; time: {toc - tic:0.4f} seconds", flush=True) if self._config.max_n_matches is None: self._max_n_matches = self._true_max_n_matches @@ -444,23 +450,14 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix 'use_threads': True, 'n_jobs': self._config.number_of_processes } - - return awesome_cossim_topn(tf_idf_matrix_1, tf_idf_matrix_2, + tic = time.perf_counter() + tup = awesome_cossim_minmax_topn(tf_idf_matrix_1, tf_idf_matrix_2, self._max_n_matches, self._config.min_similarity, **optional_kwargs) - - def _symmetrize_matches_list(self): - # [symmetrized matches_list] = [matches_list] UNION [transposed matches_list] (i.e., column-names swapped): - self._matches_list = self._matches_list.set_index(['master_side', 'dupe_side'])\ - .combine_first( - self._matches_list.rename( - columns={ - 'master_side': 'dupe_side', - 'dupe_side': 'master_side' - } - ).set_index(['master_side', 'dupe_side']) - ).reset_index() + toc = time.perf_counter() + print(f"2. 
_true_max_n_matches = {tup[0]}; time: {toc - tic:0.4f} seconds", flush=True) + return tup def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" @@ -479,6 +476,13 @@ def _get_non_matches_list(self) -> pd.DataFrame: missing_pairs['similarity'] = 0 return missing_pairs + @staticmethod + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + @staticmethod def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: """Returns the true maximum number of matches over all strings in master""" @@ -496,25 +500,12 @@ def get_n_matches(i: int) -> int: return np.amax(v(range(M))) @staticmethod - def _get_matches_list(matches) -> pd.DataFrame: + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" - non_zeros = matches.nonzero() - - sparserows = non_zeros[0] - sparsecols = non_zeros[1] - nr_matches = sparsecols.size - master_side = np.empty([nr_matches], dtype=int) - dupe_side = np.empty([nr_matches], dtype=int) - similarity = np.zeros(nr_matches) - - for index in range(0, nr_matches): - master_side[index] = sparserows[index] - dupe_side[index] = sparsecols[index] - similarity[index] = matches.data[index] - - matches_list = pd.DataFrame({'master_side': master_side, - 'dupe_side': dupe_side, - 'similarity': similarity}) + r, c = matches.nonzero() + matches_list = pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) return matches_list def _get_nearest_matches(self, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index cbc4bcae..452273ac 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,7 +9,10 @@ match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities 
from unittest.mock import patch +from scipy.sparse.csgraph._flow import csr_matrix +def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: + return A class SimpleExample(object): def __init__(self): @@ -196,14 +199,14 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matches_list') - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matches_list): + @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] sg = StringGrouper(df, max_n_matches=2).fit() - mock_symmetrize_matches_list.assert_called_once() + mock_symmetrize_matrix.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] @@ -333,7 +336,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[1].toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 68a51a1c84da823c95e03f034eb0fa76417ba654 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 18:02:45 
+0200 Subject: [PATCH 04/29] made significant performance enhancements: 1. _symmetrize_matrix instead of _symmetrize_matches_list (boost x5) 2. _get_matches_list (boost x33) 3. awesome_cossim_true_minmax_topn_only (boost x43) --- sparse_dot_topn/.gitignore | 4 +- sparse_dot_topn/__init__.py | 4 +- sparse_dot_topn/awesome_cossim_minmax_topn.py | 92 ------------------- sparse_dot_topn/awesome_cossim_topn.py | 62 +++++++++++++ sparse_dot_topn/sparse_dot_topn.pyx | 65 +++++++++++-- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 86 ++++++++++++++++- sparse_dot_topn/sparse_dot_topn_parallel.h | 13 ++- sparse_dot_topn/sparse_dot_topn_source.cpp | 68 ++++++++++++-- sparse_dot_topn/sparse_dot_topn_source.h | 64 +++++++------ sparse_dot_topn/sparse_dot_topn_threaded.pyx | 38 +++++++- string_grouper/string_grouper.py | 60 +++++------- string_grouper/test/test_string_grouper.py | 2 +- 12 files changed, 376 insertions(+), 182 deletions(-) delete mode 100644 sparse_dot_topn/awesome_cossim_minmax_topn.py diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore index d40e00a1..97caf501 100644 --- a/sparse_dot_topn/.gitignore +++ b/sparse_dot_topn/.gitignore @@ -1,4 +1,4 @@ +/sparse_dot_topn_threaded.cpp /sparse_dot_topn.cp39-win_amd64.pyd -/sparse_dot_topn_threaded.cp39-win_amd64.pyd /sparse_dot_topn.cpp -/sparse_dot_topn_threaded.cpp +/sparse_dot_topn_threaded.cp39-win_amd64.pyd diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index 09d2bfa7..9cfee892 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -2,6 +2,6 @@ import sys if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_minmax_topn import awesome_cossim_minmax_topn + from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only else: - from awesome_cossim_minmax_topn import awesome_cossim_minmax_topn \ No newline at end of file + from awesome_cossim_topn import awesome_cossim_topn, 
awesome_cossim_true_minmax_topn_only \ No newline at end of file diff --git a/sparse_dot_topn/awesome_cossim_minmax_topn.py b/sparse_dot_topn/awesome_cossim_minmax_topn.py deleted file mode 100644 index 92fdf87f..00000000 --- a/sparse_dot_topn/awesome_cossim_minmax_topn.py +++ /dev/null @@ -1,92 +0,0 @@ -import sys -import numpy as np -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr - -if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread -# else: - # import sparse_dot_topn as ct - # import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_minmax_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): - """ - This function will return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - A and B: two CSR matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - C: result matrix - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - return 0, csr_matrix((data, indices, indptr), shape=(M, N)) - - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_minmax_topn( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' - raise ValueError(err_str) - - ct_thread.sparse_dot_minmax_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, minmax_topn, n_jobs) - - return minmax_topn[0], csr_matrix((data, indices, indptr), shape=(M, N)) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index c4af03d4..b93298de 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -87,3 +87,65 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): indptr, indices, data, n_jobs) return csr_matrix((data, indices, indptr), shape=(M, N)) + +def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + 
use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_minmax_topn( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_minmax_topn_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 1da3181a..54771132 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -14,6 +14,8 @@ # Author: Zhe Sun, Ahmet Erdem # April 20, 2017 +# Modified by: Particular Miner +# April 14, 2021 # distutils: language = c++ @@ -37,7 +39,7 @@ cdef extern from "sparse_dot_topn_source.h": int Cj[], double Cx[]); - cdef void sparse_dot_minmax_topn_source( + cdef void sparse_dot_plus_minmax_topn_source( int n_row, int n_col, int Ap[], @@ -53,6 +55,15 @@ cdef extern from "sparse_dot_topn_source.h": double Cx[], int minmax_topn[]); + cdef void sparse_dot_only_minmax_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int minmax_topn[]); + cpdef sparse_dot_topn( int n_row, int n_col, @@ -70,7 +81,7 @@ cpdef sparse_dot_topn( ): """ Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrxi C in CSR format, where + This function will return a matrix C in CSR format, where C = [sorted top n results and results > lower_bound for each row of A * B] Input: @@ -103,7 +114,7 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -cpdef sparse_dot_minmax_topn( +cpdef sparse_dot_plus_minmax_topn( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -120,11 +131,11 @@ cpdef sparse_dot_minmax_topn( np.ndarray[int, ndim=1] minmax_topn ): """ - Cython glue function to call sparse_dot_minmax_topn C++ implementation + Cython glue function to call sparse_dot_plus_minmax_topn C++ implementation This function will return 
a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. It also returns minmax_ntop (the maximum number of columns set - for each row of A * B when ntop is infinite) + per row over all rows of A * B assuming ntop is infinite) Input: n_row: number of rows of A matrix @@ -138,8 +149,8 @@ cpdef sparse_dot_minmax_topn( Output by reference: c_indptr, c_indices, c_data: CSR expression of C matrix - minmax_ntop: the maximum number of columns set for each row of - A * B when ntop is infinite + minmax_ntop: the maximum number of columns set per row over all rows of + A * B assuming ntop is infinite N.B. A and B must be CSR format!!! The type of input numpy array must be aligned with types of C++ function aguments! @@ -156,5 +167,43 @@ cpdef sparse_dot_minmax_topn( cdef double* Cx = &c_data[0] cdef int* o_minmax_topn = &minmax_topn[0] - sparse_dot_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + sparse_dot_plus_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) + return + +cpdef sparse_dot_only_minmax_topn( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_minmax_topn_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) return \ No newline at end of file diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index dc123b80..c2b9a0b9 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -17,11 +17,14 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #include #include #include #include +#include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" @@ -274,7 +277,7 @@ for(int i = start_row; i < end_row; i++){ } -void sparse_dot_minmax_topn_parallel(int n_row, +void sparse_dot_plus_minmax_topn_parallel(int n_row, int n_col, int Ap[], int Aj[], @@ -364,3 +367,84 @@ void sparse_dot_minmax_topn_parallel(int n_row, *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); } + +void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inner, + int Ap_copy[], int Aj_copy[], + int Bp_copy[], int Bj_copy[], + int *minmax_ntop) +{ + std::vector unmarked(n_col_inner, true); + + for(int i = start_row; i < end_row; i++){ + + int length = 0; + + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; + + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + } +} + +void sparse_dot_only_minmax_topn_parallel(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop, + int n_jobs) +{ + std::vector job_load_sz(n_jobs, n_row/n_jobs); + + int rem = n_row % n_jobs; + for (int r = 0; r < rem; r++) job_load_sz[r] += 1; + + std::vector> split_row_vector(n_jobs); + + std::vector split_minmax_ntop(n_jobs, 0); + + std::vector thread_list(n_jobs); + + int start = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); + + temp_vector[0] = start; + temp_vector[1] = start + job_load_sz[job_nr]; + start = temp_vector[1]; + + split_row_vector[job_nr] = temp_vector; + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + + + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + + thread_list[job_nr] = std::thread (inner_sparse_only_minmax_function, + start_row, end_row, n_col, + Ap, Aj, Bp, Bj, + &split_minmax_ntop[job_nr]); + + } + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + + *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); +} diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index bd70b573..cb43cd1c 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H @@ -36,7 +38,7 @@ extern void sparse_dot_topn_parallel(int n_row, double Cx[], int n_jobs); -extern void sparse_dot_minmax_topn_parallel(int n_row, +extern void sparse_dot_plus_minmax_topn_parallel(int n_row, int n_col, int Ap[], int Aj[], @@ -52,4 +54,13 @@ extern void sparse_dot_minmax_topn_parallel(int n_row, int* minmax_topn, int n_jobs); +extern void sparse_dot_only_minmax_topn_parallel(int n_row, + int n_col, + 
int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop, + int n_jobs); + #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index e5cc3e12..dcf99637 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #include #include @@ -29,7 +31,7 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value /* C++ implementation of sparse_dot_topn - This function will return a matrxi C in CSR format, where + This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B] Input: @@ -131,12 +133,12 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_minmax_topn + C++ implementation of sparse_dot_plus_minmax_topn_source This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. It also returns minmax_ntop (the maximum number of columns set - for each row of A * B when ntop is infinite) + per row over all rows of A * B assuming ntop is infinite) Input: n_row: number of rows of A matrix @@ -150,12 +152,12 @@ void sparse_dot_topn_source(int n_row, Output by reference: Cp, Cj, Cx: CSR expression of C matrix - minmax_ntop: the maximum number of columns set for each row of - A * B when ntop is infinite + minmax_ntop: the maximum number of columns set per row over all + rows of A * B assuming ntop is infinite N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_minmax_topn_source(int n_row, +void sparse_dot_plus_minmax_topn_source(int n_row, int n_col, int Ap[], int Aj[], @@ -241,3 +243,57 @@ void sparse_dot_minmax_topn_source(int n_row, Cp[i+1] = nnz; } } + +/* + C++ implementation of sparse_dot_only_minmax_topn_source + + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row + over all rows of A * B + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_only_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop) +{ + std::vector unmarked(n_col, true); + + *minmax_ntop = 0; + + for(int i = 0; i < n_row; i++){ + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; // kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + } +} diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index d51de107..6143eb93 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -17,6 +17,8 @@ // Author: Zhe Sun, Ahmet Erdem // April 20, 2017 +// Modified by: Particular Miner +// April 14, 2021 #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H @@ -26,32 +28,40 @@ struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); extern void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); - -extern void sparse_dot_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int *minmax_topn); + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[]); //data of C + +extern void sparse_dot_plus_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int *minmax_topn); + +extern void sparse_dot_only_minmax_topn_source(int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *minmax_ntop); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 1cef2229..0bb45a6a 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -14,6 +14,8 @@ # Author: Zhe Sun, Ahmet Erdem # April 20, 2017 +# Modified by: Particular Miner +# April 
14, 2021 # distutils: language = c++ @@ -38,7 +40,7 @@ cdef extern from "sparse_dot_topn_parallel.h": double Cx[], int n_jobs); - cdef void sparse_dot_minmax_topn_parallel( + cdef void sparse_dot_plus_minmax_topn_parallel( int n_row, int n_col, int Ap[], @@ -55,6 +57,16 @@ cdef extern from "sparse_dot_topn_parallel.h": int minmax_ntop[], int n_jobs); + cdef void sparse_dot_only_minmax_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int minmax_ntop[], + int n_jobs); + cpdef sparse_dot_topn_threaded( int n_row, int n_col, @@ -86,7 +98,7 @@ cpdef sparse_dot_topn_threaded( lower_bound, Cp, Cj, Cx, n_jobs) return -cpdef sparse_dot_minmax_topn_threaded( +cpdef sparse_dot_plus_minmax_topn_threaded( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -115,6 +127,26 @@ cpdef sparse_dot_minmax_topn_threaded( cdef double* Cx = &c_data[0] cdef int* o_minmax_ntop = &minmax_ntop[0] - sparse_dot_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + sparse_dot_plus_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) return + +cpdef sparse_dot_only_minmax_topn_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_ntop, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_ntop = &minmax_ntop[0] + + sparse_dot_only_minmax_topn_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_ntop, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 73689cf5..53fc3600 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,9 +6,8 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from 
typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_minmax_topn +from sparse_dot_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only from functools import wraps -import time DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_REGEX: str = r'[,-./]|\s' @@ -248,13 +247,11 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() - # Calculate the matches using the cosine similarity - self._true_max_n_matches, matches = self._build_matches(master_matrix, duplicate_matrix) + matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None: - # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) + # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) - # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -435,29 +432,30 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: - if self._config.min_similarity <= 0: - tic = time.perf_counter() - self._true_max_n_matches = StringGrouper._get_true_max_n_matches(tf_idf_matrix_1, tf_idf_matrix_2) - toc = time.perf_counter() - print(f"1. 
_true_max_n_matches = {self._true_max_n_matches}; time: {toc - tic:0.4f} seconds", flush=True) - if self._config.max_n_matches is None: - self._max_n_matches = self._true_max_n_matches - optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { 'use_threads': True, 'n_jobs': self._config.number_of_processes } - tic = time.perf_counter() - tup = awesome_cossim_minmax_topn(tf_idf_matrix_1, tf_idf_matrix_2, - self._max_n_matches, - self._config.min_similarity, - **optional_kwargs) - toc = time.perf_counter() - print(f"2. _true_max_n_matches = {tup[0]}; time: {toc - tic:0.4f} seconds", flush=True) - return tup + + # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: + if self._config.min_similarity <= 0: + self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( + tf_idf_matrix_1, + tf_idf_matrix_2, + **optional_kwargs + ) + # if kwarg max_n_matches was not set then set it now to true value + if self._config.max_n_matches is None: + self._max_n_matches = self._true_max_n_matches + + return awesome_cossim_topn( + tf_idf_matrix_1, tf_idf_matrix_2, + self._max_n_matches, + self._config.min_similarity, + **optional_kwargs + ) def _get_non_matches_list(self) -> pd.DataFrame: """Returns a list of all the indices of non-matching pairs (with similarity set to 0)""" @@ -483,22 +481,6 @@ def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: A[c, r] = A[r, c] return A.tocsr() - @staticmethod - def _get_true_max_n_matches(AA: csr_matrix, BB: csr_matrix) -> int: - """Returns the true maximum number of matches over all strings in master""" - def get_n_matches(i: int) -> int: - a_cols = A.indices[A.indptr[i]:A.indptr[i+1]] - nz = np.full(N, 0, dtype=int) - for j in a_cols: - nz[B.indices[B.indptr[j]:B.indptr[j+1]]] = 1 - return np.sum(nz) - - A, B = AA.tocsr(), BB.tocsr() - M, _ = A.shape - _, N = B.shape - v = np.vectorize(get_n_matches) - return np.amax(v(range(M))) - @staticmethod def 
_get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 452273ac..64b8caf8 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -336,7 +336,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[1].toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 798daf78fe9727be621b45d29db7b45652951f95 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 14 Apr 2021 19:17:51 +0200 Subject: [PATCH 05/29] updated setup.py --- setup.py | 63 +++++++++++++++++++--- sparse_dot_topn/__init__.py | 7 +-- sparse_dot_topn/awesome_cossim_topn.py | 9 +--- string_grouper/string_grouper.py | 24 ++++----- string_grouper/test/test_string_grouper.py | 12 ++--- 5 files changed, 77 insertions(+), 38 deletions(-) diff --git a/setup.py b/setup.py index f4b5ecb0..535aa5c7 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ -from setuptools import setup +from setuptools import setup, Extension import pathlib +import os # The directory containing this file HERE = pathlib.Path(__file__).parent @@ -7,13 +8,53 @@ # The text of the README file README = (HERE / "README.md").read_text() +# workaround for numpy and Cython install dependency +# the solution is from https://stackoverflow.com/a/54138355 +def my_build_ext(pars): + # import delayed: + from setuptools.command.build_ext import build_ext as _build_ext + class build_ext(_build_ext): + def finalize_options(self): + _build_ext.finalize_options(self) + # Prevent numpy from thinking it is still in its 
setup process: + __builtins__.__NUMPY_SETUP__ = False + import numpy + self.include_dirs.append(numpy.get_include()) + + #object returned: + return build_ext(pars) + +if os.name == 'nt': + extra_compile_args = ["-Ox"] +else: + extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] + +original_ext = Extension('sparse_dot_topn.sparse_dot_topn', + sources=['./sparse_dot_topn/sparse_dot_topn.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp'], + extra_compile_args=extra_compile_args, + language='c++') + +threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', + sources=[ + './sparse_dot_topn/sparse_dot_topn_threaded.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp', + './sparse_dot_topn/sparse_dot_topn_parallel.cpp'], + extra_compile_args=extra_compile_args, + language='c++') + setup( name='string_grouper', version='0.4.0', - packages=['string_grouper'], + packages=[ + 'string_grouper' + , 'string_grouper_utils' + , 'sparse_dot_topn' + ], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', + keywords='cosine-similarity sparse-matrix sparse-graph scipy cython', author='Chris van den Berg', long_description=README, long_description_content_type="text/markdown", @@ -21,10 +62,20 @@ url='https://github.com/Bergvca/string_grouper', zip_safe=False, python_requires='>3.7', - install_requires=['pandas>=0.25.3' + setup_requires=[# Setuptools 18.0 properly handles Cython extensions. + 'setuptools>=18.0' + , 'cython>=0.29.15' + , 'numpy' + , 'scipy' + ], + install_requires=[# Setuptools 18.0 properly handles Cython extensions. 
+ 'setuptools>=18.0' + , 'cython>=0.29.15' + , 'numpy' , 'scipy' , 'scikit-learn' - , 'numpy' - , 'sparse_dot_topn>=0.2.6' - ] + , 'pandas>=0.25.3' + ], + cmdclass={'build_ext': my_build_ext}, + ext_modules=[original_ext, threaded_ext] ) diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index 9cfee892..d7e882f0 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -1,7 +1,2 @@ # flake8: noqa -import sys - -if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only -else: - from awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only \ No newline at end of file +from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index b93298de..ee6c2ca2 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -1,14 +1,9 @@ -import sys import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread -else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread +from sparse_dot_topn import sparse_dot_topn as ct +from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 53fc3600..69ecd912 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -249,8 +249,8 @@ def fit(self) -> 'StringGrouper': master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity matches = 
self._build_matches(master_matrix, duplicate_matrix) - if self._duplicates is None: - # the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) + if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: + # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) # build list from matrix self._matches_list = self._get_matches_list(matches) @@ -439,16 +439,16 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix 'n_jobs': self._config.number_of_processes } - # if min_similarity <= 0 compute the true maximum number of matches over all strings in master: - if self._config.min_similarity <= 0: - self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( - tf_idf_matrix_1, - tf_idf_matrix_2, - **optional_kwargs - ) - # if kwarg max_n_matches was not set then set it now to true value - if self._config.max_n_matches is None: - self._max_n_matches = self._true_max_n_matches + # compute the true maximum number of matches over all strings in master: + self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( + tf_idf_matrix_1, + tf_idf_matrix_2, + **optional_kwargs + ) + + if self._config.min_similarity <= 0 and self._config.max_n_matches is None: + # if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value + self._max_n_matches = self._true_max_n_matches return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 64b8caf8..c928bfa3 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -3,13 +3,11 @@ import numpy as np from scipy.sparse.csr import csr_matrix from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ - DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \ - 
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ + DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ - match_most_similar, group_similar_strings, match_strings,\ + match_most_similar, group_similar_strings, match_strings, \ compute_pairwise_similarities from unittest.mock import patch -from scipy.sparse.csgraph._flow import csr_matrix def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: return A @@ -383,7 +381,7 @@ def test_get_matches_single(self): left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] left_index = [0, 0, 1, 2, 3, 3] - right_index = [0, 3, 1, 2, 0, 3] + right_index = [3, 0, 1, 2, 3, 0] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, @@ -399,8 +397,8 @@ def test_get_matches_1_series_1_id_series(self): left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] left_index = [0, 0, 1, 2, 3, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] - right_index = [0, 3, 1, 2, 0, 3] + right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'] + right_index = [3, 0, 1, 2, 3, 0] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, From 5b2ce38bfeb19eec321c68f2db5bf76cc45faa20 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 17 Apr 2021 09:01:02 +0200 Subject: [PATCH 06/29] attempted to remove n_max_matches restriction altogether --- sparse_dot_topn/.gitignore | 4 - sparse_dot_topn/awesome_cossim_topn.py | 85 +++++++- sparse_dot_topn/sparse_dot_topn.pyx | 73 ++++--- sparse_dot_topn/sparse_dot_topn_source.cpp | 213 +++++++++++++++++++-- 
sparse_dot_topn/sparse_dot_topn_source.h | 27 ++- 5 files changed, 338 insertions(+), 64 deletions(-) delete mode 100644 sparse_dot_topn/.gitignore diff --git a/sparse_dot_topn/.gitignore b/sparse_dot_topn/.gitignore deleted file mode 100644 index 97caf501..00000000 --- a/sparse_dot_topn/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -/sparse_dot_topn_threaded.cpp -/sparse_dot_topn.cp39-win_amd64.pyd -/sparse_dot_topn.cpp -/sparse_dot_topn_threaded.cp39-win_amd64.pyd diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index ee6c2ca2..808f5d8b 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -40,6 +40,88 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): nnz_max = M*ntop + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + return csr_matrix((data, indices, indptr), shape=(M, N)) + + # indptr is the only array whose length is known + indptr = np.empty(M+1, dtype=idx_dtype) + + if not False: + + indices, data = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr) + # print(f'(M, N) = {(M, N)}') + # print(f'indptr = {indptr}') + # print(f'indptr.flags = {indptr.flags}') + # print(f'indices = {indices}') + # print(f'indices.flags = {indices.flags}') + # print(f'data = {data}') + # print(f'data.flags = {data.flags}') + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, n_jobs) + + return csr_matrix((data, indices, indptr), shape=(M, N)) + + +def suspend_awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): + """ + This function will return a matrxi C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + A and B: two CSR matrix + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + C: result matrix + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + # basic check. if A or B are all zeros matrix, return all zero matrix directly if len(A.indices) == 0 or len(B.indices) == 0: indptr = np.zeros(M + 1, dtype=idx_dtype) @@ -83,6 +165,7 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): return csr_matrix((data, indices, indptr), shape=(M, N)) + def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): """ This function will return the maximum number of columns set @@ -116,7 +199,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): minmax_topn = np.full(1, 0, dtype=idx_dtype) - # basic check. if A or B are all zeros matrix, return all zero matrix directly + # basic check. 
if A or B are all zeros matrix, return 0 directly if len(A.indices) == 0 or len(B.indices) == 0: return 0 diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 54771132..59ed57bf 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -19,9 +19,17 @@ # distutils: language = c++ -import numpy as np +from libc.stdio cimport printf +from libcpp.vector cimport vector +from libc.stdlib cimport free +from cpython.pycapsule cimport PyCapsule_New, PyCapsule_IsValid, PyCapsule_GetPointer, PyCapsule_GetName cimport numpy as np +np.import_array() + +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( @@ -39,7 +47,7 @@ cdef extern from "sparse_dot_topn_source.h": int Cj[], double Cx[]); - cdef void sparse_dot_plus_minmax_topn_source( + cdef void sparse_dot_free_source( int n_row, int n_col, int Ap[], @@ -48,12 +56,10 @@ cdef extern from "sparse_dot_topn_source.h": int Bp[], int Bj[], double Bx[], - int topn, double lower_bound, int Cp[], - int Cj[], - double Cx[], - int minmax_topn[]); + vector[int]* Cj, + vector[double]* Cx); cdef void sparse_dot_only_minmax_topn_source( int n_row, @@ -114,7 +120,13 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -cpdef sparse_dot_plus_minmax_topn( +# destructor +cdef void free_ptr(object cap): + # This should probably have some error checking in + # or at very least clear any errors raised once it's done + free(PyCapsule_GetPointer(cap, PyCapsule_GetName(cap))) + +cpdef sparse_dot_free( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -123,19 +135,13 @@ cpdef sparse_dot_plus_minmax_topn( np.ndarray[int, ndim=1] b_indptr, np.ndarray[int, ndim=1] b_indices, np.ndarray[double, ndim=1] b_data, - int ntop, double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - 
np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] minmax_topn + np.ndarray[int, ndim=1] c_indptr ): """ - Cython glue function to call sparse_dot_plus_minmax_topn C++ implementation + Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - It also returns minmax_ntop (the maximum number of columns set - per row over all rows of A * B assuming ntop is infinite) + C = [all results > lower_bound for each row of A * B] Input: n_row: number of rows of A matrix @@ -144,16 +150,13 @@ cpdef sparse_dot_plus_minmax_topn( a_indptr, a_indices, a_data: CSR expression of A matrix b_indptr, b_indices, b_data: CSR expression of B matrix - ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: c_indptr, c_indices, c_data: CSR expression of C matrix - minmax_ntop: the maximum number of columns set per row over all rows of - A * B assuming ntop is infinite N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function aguments! + The type of input numpy array must be aligned with types of C++ function arguments! 
""" cdef int* Ap = &a_indptr[0] @@ -163,12 +166,32 @@ cpdef sparse_dot_plus_minmax_topn( cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* o_minmax_topn = &minmax_topn[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx) + + cdef np.npy_intp nnz = Cp[n_row] + cdef np.ndarray[np.int32_t, ndim=1] c_indices = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_INT32, vCj.data()) + PyArray_ENABLEFLAGS(c_indices, np.NPY_OWNDATA) + cdef np.ndarray[np.double_t, ndim=1] c_data = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_DOUBLE, vCx.data()) + PyArray_ENABLEFLAGS(c_data, np.NPY_OWNDATA) + + # cdef const char *name_vCj_capsule = "vCj" + # cdef int* vCj_data = vCj.data() + # vCj_capsule = PyCapsule_New( vCj_data, name_vCj_capsule, &free_ptr) + # if not PyCapsule_IsValid(vCj_capsule, name_vCj_capsule): + # raise ValueError(f"invalid pointer ({name_vCj_capsule}) to parameters") + # + # cdef const char *name_vCx_capsule = "vCx" + # cdef double* vCx_data = vCx.data() + # vCx_capsule = PyCapsule_New( vCx_data, name_vCx_capsule, &free_ptr) + # if not PyCapsule_IsValid(vCx_capsule, name_vCx_capsule): + # raise ValueError(f"invalid pointer ({name_vCx_capsule}) to parameters") + + return c_indices, c_data - sparse_dot_plus_minmax_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, o_minmax_topn) - return cpdef sparse_dot_only_minmax_topn( int n_row, diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index dcf99637..c4544790 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -133,12 +133,10 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_plus_minmax_topn_source + C++ implementation of sparse_dot_source This function will 
return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - It also returns minmax_ntop (the maximum number of columns set - per row over all rows of A * B assuming ntop is infinite) + C = [all results > lower_bound sorted for each row of A * B]. Input: n_row: number of rows of A matrix @@ -147,17 +145,15 @@ void sparse_dot_topn_source(int n_row, Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results + memory_bound: the maximum number of elements per row of C lower_bound: a threshold that the element of A*B must greater than Output by reference: Cp, Cj, Cx: CSR expression of C matrix - minmax_ntop: the maximum number of columns set per row over all - rows of A * B assuming ntop is infinite N.B. A and B must be CSR format!!! */ -void sparse_dot_plus_minmax_topn_source(int n_row, +void sparse_dot_source(int n_row, int n_col, int Ap[], int Aj[], @@ -165,24 +161,22 @@ void sparse_dot_plus_minmax_topn_source(int n_row, int Bp[], int Bj[], double Bx[], //data of B - int ntop, + int memory_bound, double lower_bound, int Cp[], int Cj[], - double Cx[], - int *minmax_ntop) + double Cx[]) { std::vector next(n_col,-1); std::vector sums(n_col, 0); std::vector candidates; + candidates.reserve(memory_bound); int nnz = 0; Cp[0] = 0; - *minmax_ntop = 0; - for(int i = 0; i < n_row; i++){ int head = -2; int length = 0; @@ -207,7 +201,6 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } } } - *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) @@ -226,12 +219,7 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } + std::sort(candidates.begin(), candidates.end(), candidate_cmp); for(int a=0; a < len; a++){ Cj[nnz] = candidates[a].index; @@ -244,6 +232,191 @@ void sparse_dot_plus_minmax_topn_source(int n_row, } } +/* + C++ implementation of sparse_dot_free_source + + This function will return a matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B]. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + memory_bound: the maximum number of elements per row of C + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + Cp: C array for idx_pointer of CSR expression of C matrix + Cj: numpy array for indices of CSR expression of C matrix + Cx: numpy array for data values of CSR expression of C matrix + + N.B. A and B must be CSR format!!! 
+*/ +void sparse_dot_free_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx) +{ + int sz = std::max(n_row, n_col); + Cj->reserve(sz); + Cx->reserve(sz); + + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + + for(int a=0; a < len; a++){ + Cj->push_back(candidates[a].index); + Cx->push_back(candidates[a].value); + } + candidates.clear(); + + Cp[i+1] = Cj->size(); + } +} + +/* + C++ implementation of sparse_dot_nnz_source + + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. 
+ This function is designed primarily to help with memory management for + very large sparse matrices. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix + + lower_bound: a threshold that the element of A*B must greater than + + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C + + N.B. A and B must be CSR format!!! +*/ +void sparse_dot_nnz_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int* nnz, + int* ntop) +{ + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + *nnz = 0; + *ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + int nnz_k = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + *ntop = (nnz_k > *ntop)? 
nnz_k : *ntop; + *nnz += nnz_k; + } +} + /* C++ implementation of sparse_dot_only_minmax_topn_source diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 6143eb93..664378e3 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -23,6 +23,7 @@ #ifndef UTILS_CPPCLASS_H #define UTILS_CPPCLASS_H + struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); @@ -41,20 +42,18 @@ extern void sparse_dot_topn_source(int n_row, int Cj[], double Cx[]); //data of C -extern void sparse_dot_plus_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - int *minmax_topn); +extern void sparse_dot_free_source(int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx); extern void sparse_dot_only_minmax_topn_source(int n_row, int n_col, From d6f31278636287880e5f66d9ef6290dedcb52732 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 17 Apr 2021 09:48:44 +0200 Subject: [PATCH 07/29] removed the restriction n_max_matches put on memory allocation --- setup.py | 19 +- sparse_dot_topn/__init__.py | 7 +- sparse_dot_topn/array_wrappers.pxd | 18 + sparse_dot_topn/array_wrappers.pyx | 73 ++ sparse_dot_topn/awesome_cossim_topn.py | 263 +++---- sparse_dot_topn/example/comparison.py | 137 ++++ sparse_dot_topn/example/comparison2.py | 169 ++++ sparse_dot_topn/example/example.py | 14 + sparse_dot_topn/sparse_dot_topn.pyx | 261 ++++--- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 731 +++++++++++------- sparse_dot_topn/sparse_dot_topn_parallel.h | 97 ++- 
sparse_dot_topn/sparse_dot_topn_source.cpp | 159 ++-- sparse_dot_topn/sparse_dot_topn_source.h | 68 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 239 +++--- .../test/test_awesome_cossim_topn.py | 346 +++++++++ string_grouper/string_grouper.py | 25 +- string_grouper/test/test_string_grouper.py | 2 +- 17 files changed, 1892 insertions(+), 736 deletions(-) create mode 100644 sparse_dot_topn/array_wrappers.pxd create mode 100644 sparse_dot_topn/array_wrappers.pyx create mode 100644 sparse_dot_topn/example/comparison.py create mode 100644 sparse_dot_topn/example/comparison2.py create mode 100644 sparse_dot_topn/example/example.py create mode 100644 sparse_dot_topn/test/test_awesome_cossim_topn.py diff --git a/setup.py b/setup.py index 535aa5c7..5cb9c5e0 100644 --- a/setup.py +++ b/setup.py @@ -29,9 +29,19 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] +array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', + sources=[ + './sparse_dot_topn/array_wrappers.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], + extra_compile_args=extra_compile_args, + language='c++') + original_ext = Extension('sparse_dot_topn.sparse_dot_topn', - sources=['./sparse_dot_topn/sparse_dot_topn.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp'], + sources=[ + './sparse_dot_topn/sparse_dot_topn.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], extra_compile_args=extra_compile_args, language='c++') @@ -39,7 +49,8 @@ def finalize_options(self): sources=[ './sparse_dot_topn/sparse_dot_topn_threaded.pyx', './sparse_dot_topn/sparse_dot_topn_source.cpp', - './sparse_dot_topn/sparse_dot_topn_parallel.cpp'], + './sparse_dot_topn/sparse_dot_topn_parallel.cpp' + ], extra_compile_args=extra_compile_args, language='c++') @@ -77,5 +88,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[original_ext, threaded_ext] + ext_modules=[array_wrappers_ext, original_ext, threaded_ext] 
) diff --git a/sparse_dot_topn/__init__.py b/sparse_dot_topn/__init__.py index d7e882f0..cbaf32a7 100644 --- a/sparse_dot_topn/__init__.py +++ b/sparse_dot_topn/__init__.py @@ -1,2 +1,7 @@ # flake8: noqa -from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only +import sys + +if sys.version_info[0] >= 3: + from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn +else: + from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd new file mode 100644 index 00000000..f3342ef5 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pxd @@ -0,0 +1,18 @@ +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx new file mode 100644 index 00000000..d0dd4f3e --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pyx @@ -0,0 +1,73 @@ +from cpython cimport Py_buffer +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 808f5d8b..6e459b29 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -1,27 +1,55 @@ +import sys import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from sparse_dot_topn import sparse_dot_topn as ct -from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): +if sys.version_info[0] >= 3: + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread +else: + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread + + +def awesome_cossim_topn( + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + ntop_is_flexible=False, + mem_manager_is_C=False, + return_best_topn=False + ): """ - This function will 
return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True it will also return best_topn (the + true maximum number of elements > lower_bound per row of A * B). Input: - A and B: two CSR matrix + A and B: two CSR matrices ntop: n top results lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not + use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 + ntop_is_flexible: if True, memory management will be handed over to C/C++ if + python's attempt at allocating memory fails. + mem_manager_is_C: (this is mainly for testing purposes) if True, will force + memory management to be handed over to C/C++. Should be + used only when ntop >= number of columns of B or + ntop_is_flexible=True. Defaults to False. + return_best_topn: if True, will return best_topn together with C as a tuple: + (C, best_topn) Output: - C: result matrix + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. - N.B. if A and B are not CSR format, they will be converted to CSR + N.B. 
if A and B are not in CSR format, they will be converted to CSR """ if not isspmatrix_csr(A): A = A.tocsr() @@ -45,128 +73,105 @@ def awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): indptr = np.zeros(M + 1, dtype=idx_dtype) indices = np.zeros(nnz_max, dtype=idx_dtype) data = np.zeros(nnz_max, dtype=A.dtype) - return csr_matrix((data, indices, indptr), shape=(M, N)) - - # indptr is the only array whose length is known - indptr = np.empty(M+1, dtype=idx_dtype) - - if not False: - - indices, data = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr) - # print(f'(M, N) = {(M, N)}') - # print(f'indptr = {indptr}') - # print(f'indptr.flags = {indptr.flags}') - # print(f'indices = {indices}') - # print(f'indices.flags = {indices.flags}') - # print(f'data = {data}') - # print(f'data.flags = {data.flags}') - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, n_jobs) - - return csr_matrix((data, indices, indptr), shape=(M, N)) - - -def suspend_awesome_cossim_topn(A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1): - """ - This function will return a matrxi C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - A and B: two CSR matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - C: result matrix - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - return csr_matrix((data, indices, indptr), shape=(M, N)) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output # filled matrices from here on indptr = np.empty(M+1, dtype=idx_dtype) - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - if not use_threads: - - ct.sparse_dot_topn( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data) - + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + if ntop_is_flexible or ntop >= N: + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr + ) + + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr, n_jobs + ) + + else: + if mem_manager_is_C: + raise Exception('When mem_manager_is_C=True, set ntop >= N, or set ntop_is_flexible=True') + else: + raise Exception('Not enough memory! Data array is too large. Try reducing the value of ntop.') + else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, n_jobs) - - return csr_matrix((data, indices, indptr), shape=(M, N)) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' + raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + + best_topn = best_topn_arr[0] + + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output -def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): +def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): """ This function will return the maximum number of columns set per row over all rows of A * B @@ -205,7 +210,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): if not use_threads: - ct.sparse_dot_only_minmax_topn( + ct.sparse_dot_only_max_nnz_col( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), @@ -218,7 +223,7 @@ def awesome_cossim_true_minmax_topn_only(A, B, use_threads=False, n_jobs=1): err_str = 
'You select the multi-thread mode and n_job must be a value greater equal than 1!' raise ValueError(err_str) - ct_thread.sparse_dot_only_minmax_topn_threaded( + ct_thread.sparse_dot_only_max_nnz_col_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), diff --git a/sparse_dot_topn/example/comparison.py b/sparse_dot_topn/example/comparison.py new file mode 100644 index 00000000..7ee673ca --- /dev/null +++ b/sparse_dot_topn/example/comparison.py @@ -0,0 +1,137 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import numpy as np +from scipy.sparse import coo_matrix +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +N = 1000 +thresh = 0.01 + +nr_vocab = 2 << 24 +density = 1e-6 +n_samples = 1000000 +n_duplicates = 1000000 +nnz_a = int(n_samples * nr_vocab * density) +nnz_b = int(n_duplicates * nr_vocab * density) + + +print(f'density = {density}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_a = {nnz_a}', flush=True) +print(f'nnz_b = {nnz_b}', flush=True) +print('\n', flush=True) + +rng1 = np.random.RandomState(42) +rng2 = np.random.RandomState(43) + +row = rng1.randint(n_samples, size=nnz_a) +cols = rng2.randint(nr_vocab, size=nnz_a) +data = rng1.rand(nnz_a) + +a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) +a = a_sparse.tocsr() + +row = rng1.randint(n_duplicates, size=nnz_b) +cols = rng2.randint(nr_vocab, size=nnz_b) +data = rng1.rand(nnz_b) + +b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) +b = b_sparse.T.tocsr() + + +# top 5 results per row + +print("Original sparse_dot_topn function") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 1 thread") + 
+rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 2 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 3 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 4 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 5 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 6 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) +print(rtv) + +print("Threaded function with 7 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', + number=3, + globals=globals()) +print(rtv) + +# use scipy and numpy function + + +def get_csr_ntop_idx_data(csr_row, ntop): + """ + Get list (row index, score) of the n top matches + """ + nnz = csr_row.getnnz() + if nnz == 0: + return None + elif nnz <= ntop: + result = zip(csr_row.indices, csr_row.data) + else: + arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] + result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) + + return sorted(result, key=lambda x: -x[1]) + + +def scipy_cossim_top(A, B, ntop, lower_bound=0): + C = A.dot(B) + return [get_csr_ntop_idx_data(row, ntop) for row in C] + +# top 5 results per row which element is greater than 2 + + +print("Scipy+numpy original function") + +rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py new file mode 100644 index 00000000..7af5d08a 
--- /dev/null +++ b/sparse_dot_topn/example/comparison2.py @@ -0,0 +1,169 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import numpy as np +from scipy.sparse import coo_matrix +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +N = 1000 +thresh = 0.01 + +nr_vocab = 2 << 24 +density = 1e-6 +n_samples = 1000000 +n_duplicates = N +nnz_a = int(n_samples * nr_vocab * density) +nnz_b = int(n_duplicates * nr_vocab * density) + +print(f'density = {density}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_a = {nnz_a}', flush=True) +print(f'nnz_b = {nnz_b}', flush=True) +print('', flush=True) + +rng1 = np.random.RandomState(42) +rng2 = np.random.RandomState(43) + +row = rng1.randint(n_samples, size=nnz_a) +cols = rng2.randint(nr_vocab, size=nnz_a) +data = rng1.rand(nnz_a) + +a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) +a = a_sparse.tocsr() + +row = rng1.randint(n_duplicates, size=nnz_b) +cols = rng2.randint(nr_vocab, size=nnz_b) +data = rng1.rand(nnz_b) + +b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) +b = b_sparse.T.tocsr() + + +# top 5 results per row + +print("Non-parallelized sparse_dot_topn function") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 1 thread") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, mem_manager_is_C=True)', + number=3, + globals=globals()) 
+print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 2 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 3 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 4 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 5 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 6 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + +print("Threaded function with 7 threads") + +rtv = timeit.timeit('awesome_cossim_topn(a, 
b, N, thresh, True, 7)', + number=3, + globals=globals()) +rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, mem_manager_is_C=True)', + number=3, + globals=globals()) +print('python\t\tC/C++', flush=True) +print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + +# use scipy and numpy function + + +def get_csr_ntop_idx_data(csr_row, ntop): + """ + Get list (row index, score) of the n top matches + """ + nnz = csr_row.getnnz() + if nnz == 0: + return None + elif nnz <= ntop: + result = zip(csr_row.indices, csr_row.data) + else: + arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] + result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) + + return sorted(result, key=lambda x: -x[1]) + + +def scipy_cossim_top(A, B, ntop, lower_bound=0): + C = A.dot(B) + return [get_csr_ntop_idx_data(row, ntop) for row in C] + +# top 5 results per row which element is greater than 2 + + +print("Scipy+numpy original function") + +rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', + number=3, + globals=globals()) +print(rtv) diff --git a/sparse_dot_topn/example/example.py b/sparse_dot_topn/example/example.py new file mode 100644 index 00000000..a61951fd --- /dev/null +++ b/sparse_dot_topn/example/example.py @@ -0,0 +1,14 @@ +from scipy.sparse import rand +from sparse_dot_topn import awesome_cossim_topn + +N = 10 +a = rand(100, 1000000, density=0.005, format='csr') +b = rand(1000000, 200, density=0.005, format='csr') + +# Use standard implementation + +c = awesome_cossim_topn(a, b, 5, 0.01) + +# Use parallel implementation with 4 threads + +d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 59ed57bf..9c35d3e9 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -19,72 +19,91 @@ # distutils: language = c++ -from libc.stdio cimport printf from libcpp.vector cimport vector -from libc.stdlib cimport free -from 
cpython.pycapsule cimport PyCapsule_New, PyCapsule_IsValid, PyCapsule_GetPointer, PyCapsule_GetName +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double + cimport numpy as np +import numpy as np np.import_array() -cdef extern from "numpy/arrayobject.h": - void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); - cdef void sparse_dot_only_minmax_topn_source( +cpdef sparse_dot_topn( int n_row, int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int minmax_topn[]); - -cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - 
int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): """ Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where @@ -104,7 +123,7 @@ cpdef sparse_dot_topn( c_indptr, c_indices, c_data: CSR expression of C matrix N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function aguments! + The type of input numpy array must be aligned with types of C++ function arguments! """ cdef int* Ap = &a_indptr[0] @@ -120,28 +139,79 @@ cpdef sparse_dot_topn( sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) return -# destructor -cdef void free_ptr(object cap): - # This should probably have some error checking in - # or at very least clear any errors raised once it's done - free(PyCapsule_GetPointer(cap, PyCapsule_GetName(cap))) +cpdef sparse_dot_topn_extd( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + 
The maximum number of elements per row of C nminmax is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + nminmax: The maximum number of elements per row of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): """ - Cython glue function to call sparse_dot_topn C++ implementation + Cython glue function to call sparse_dot_free C++ implementation This function will return a matrix C in CSR format, where C = [all results > lower_bound for each row of A * B] + This function lets C++ decide how to 
manage (grow/allocate/reallocate) memory for the + storage of these results as needed during the computation; then hands over to numpy + a pointer to the memory location where the data resides Input: n_row: number of rows of A matrix @@ -166,42 +236,29 @@ cpdef sparse_dot_free( cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx) - - cdef np.npy_intp nnz = Cp[n_row] - cdef np.ndarray[np.int32_t, ndim=1] c_indices = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_INT32, vCj.data()) - PyArray_ENABLEFLAGS(c_indices, np.NPY_OWNDATA) - cdef np.ndarray[np.double_t, ndim=1] c_data = np.PyArray_SimpleNewFromData(1, &nnz, np.NPY_DOUBLE, vCx.data()) - PyArray_ENABLEFLAGS(c_data, np.NPY_OWNDATA) + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - # cdef const char *name_vCj_capsule = "vCj" - # cdef int* vCj_data = vCj.data() - # vCj_capsule = PyCapsule_New( vCj_data, name_vCj_capsule, &free_ptr) - # if not PyCapsule_IsValid(vCj_capsule, name_vCj_capsule): - # raise ValueError(f"invalid pointer ({name_vCj_capsule}) to parameters") - # - # cdef const char *name_vCx_capsule = "vCx" - # cdef double* vCx_data = vCx.data() - # vCx_capsule = PyCapsule_New( vCx_data, name_vCx_capsule, &free_ptr) - # if not PyCapsule_IsValid(vCx_capsule, name_vCx_capsule): - # raise ValueError(f"invalid pointer ({name_vCx_capsule}) to parameters") + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - return c_indices, c_data - - -cpdef sparse_dot_only_minmax_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - 
np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): + return c_indices, c_data, nminmax[0] + + +cpdef sparse_dot_only_max_nnz_col( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): """ Cython glue function to call sparse_dot_only_minmax_topn C++ implementation This function will return the maximum number of columns set @@ -228,5 +285,5 @@ cpdef sparse_dot_only_minmax_topn( cdef int* Bj = &b_indices[0] cdef int* o_minmax_topn = &minmax_topn[0] - sparse_dot_only_minmax_topn_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return \ No newline at end of file + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index c2b9a0b9..d941248e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -23,355 +23,563 @@ #include #include #include +#include #include #include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" -void inner_sparse_function(int start_row, int end_row, int n_col_inner, - int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], std::vector real_candidates[]) -{ - -std::vector next(n_col_inner,-1); -std::vector sums(n_col_inner, 0); -std::vector temp_candidates; +void distribute_load( + int load_sz, + int n_jobs, + std::vector> &ranges +) +{ + // share the load among jobs: + int equal_job_load_sz = load_sz/n_jobs; + int rem = load_sz % n_jobs; + ranges.resize(n_jobs); -int iterations_count = 0; + int start = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + std::vector temp_vector(2, 0); -for(int i = start_row; i < end_row; i++){ + temp_vector[0] = 
start; + temp_vector[1] = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); + start = temp_vector[1]; - iterations_count += 1; + ranges[job_nr] = temp_vector; + } +} - int head = -2; - int length = 0; +void inner_gather_function( + int start_row, + int end_row, + int Cp[], + int Cp_start, + int vCj_start[], + double vCx_start[], + std::vector real_candidates[] +) +{ + int Cp_i = Cp_start; + int* vCj_cursor = &vCj_start[Cp_start]; + double* vCx_cursor = &vCx_start[Cp_start]; + candidate c; + for (int i = start_row; i < end_row; i++){ + Cp_i += (int) real_candidates[i].size(); + Cp[i + 1] = Cp_i; + for (unsigned int j = 0; j < real_candidates[i].size(); j++){ + c = real_candidates[i][j]; + *(vCj_cursor++) = c.index; + *(vCx_cursor++) = c.value; + } + real_candidates[i].clear(); + } +} - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; +void inner_sparse_dot_topn( + int start_row, + int end_row, + int n_col_inner, + int ntop_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total +) +{ + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) + std::vector temp_candidates; - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j + for(int i = start_row; i < end_row; i++){ - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + int head = -2; + int length = 0; - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + 
for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - temp_candidates.push_back(c); - } + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j - int temp = head; - head = next[head]; //iterate over columns + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - int len = (int)temp_candidates.size(); - if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); - len = ntop_inner; - } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); - } + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - temp_candidates.resize(len); - real_candidates[i] = temp_candidates; + int temp = head; + head = next[head]; //iterate over columns - temp_candidates.clear(); + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } -} + int len = (int)temp_candidates.size(); + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } + else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } + (*total) += len; + temp_candidates.resize(len); + 
real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } } -void sparse_dot_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs) +void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs +) { - - Cp[0] = 0; - - int split_amount = n_row / n_jobs; - - std::vector> split_row_vector(n_jobs); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; real_cand_pointer = &real_candidates[0]; - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - int start_split = job_nr * split_amount; - int end_split = start_split + split_amount; + // initialize aggregate: + std::vector sub_total(n_jobs, 0); - if (job_nr == n_jobs -1) { - end_split = n_row; - } + std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - temp_vector[0] = start_split; - temp_vector[1] = end_split; + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - split_row_vector[job_nr] = temp_vector; + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn, + start_row, end_row, + n_col, ntop, + lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &sub_total[job_nr] + ); + } - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); + // gather the results: + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; 
job_nr++) { - int start_row = split_row_vector[job_nr][0]; int end_row = split_row_vector[job_nr][1]; + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + Cj, + Cx, + real_cand_pointer + ); + } - thread_list[job_nr] = std::thread (inner_sparse_function, start_row, - end_row, n_col, ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - } +} - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - thread_list[job_nr].join(); - } +void inner_sparse_dot_topn_extd( + int start_row, + int end_row, + int n_col_inner, + int ntop_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total, + int* n_minmax +) +{ - int nnz = 0; + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - for (int m = 0; m < n_row; m++) { + std::vector temp_candidates; - std::vector cand = real_cand_pointer[m]; + int iterations_count = 0; - int can_len = (int)cand.size(); + for(int i = start_row; i < end_row; i++){ - for(int can_nr=0; can_nr < can_len; can_nr++){ - Cj[nnz] = cand[can_nr].index; - Cx[nnz] = cand[can_nr].value; - nnz++; - } + iterations_count += 1; - Cp[m+1] = nnz; + int head = -2; + int length = 0; - } + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; -} + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) -void inner_sparse_minmax_function(int start_row, int end_row, int n_col_inner, - int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], std::vector real_candidates[], - int *minmax_ntop) -{ + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column 
of B in row j -std::vector next(n_col_inner,-1); -std::vector sums(n_col_inner, 0); + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i -std::vector temp_candidates; + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } -int iterations_count = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) -for(int i = start_row; i < end_row; i++){ + if(sums[head] > lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - iterations_count += 1; + int temp = head; + head = next[head]; //iterate over columns - int head = -2; - int length = 0; + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; + int len = (int)temp_candidates.size(); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; + if (len > ntop_inner){ + std::partial_sort(temp_candidates.begin(), + temp_candidates.begin()+ntop_inner, + temp_candidates.end(), + candidate_cmp); + len = ntop_inner; + } + else { + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); + } - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) + (*total) += len; + temp_candidates.resize(len); + real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } +} - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j +void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int *n_minmax, + int n_jobs +) +{ + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + std::vector> real_candidates(n_row); + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - *minmax_ntop = (length > *minmax_ntop)? 
length : *minmax_ntop; + // initialize aggregates: + std::vector sub_total(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + std::vector thread_list(n_jobs); - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - temp_candidates.push_back(c); - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int temp = head; - head = next[head]; //iterate over columns + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn_extd, + start_row, end_row, + n_col, ntop, + lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + real_cand_pointer, + &sub_total[job_nr], + &split_n_minmax[job_nr] + ); } - int len = (int)temp_candidates.size(); - if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); - len = ntop_inner; - } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); - } + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); + // gather the results: + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - temp_candidates.resize(len); - real_candidates[i] = temp_candidates; + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + + Cp[0] = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - temp_candidates.clear(); + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; -} + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + Cj, + Cx, + real_cand_pointer + ); + } + + for (int job_nr = 0; job_nr < n_jobs; 
job_nr++) + thread_list[job_nr].join(); } -void sparse_dot_plus_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int *minmax_ntop, - int n_jobs) +void inner_sparse_dot_free( + int start_row, + int end_row, + int n_col_inner, + double lower_bound_inner, + int Ap_copy[], + int Aj_copy[], + double Ax_copy[], + int Bp_copy[], + int Bj_copy[], + double Bx_copy[], + std::vector real_candidates[], + int* total, + int* n_minmax +) { - Cp[0] = 0; + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); - int split_amount = n_row / n_jobs; + std::vector temp_candidates; - std::vector> split_row_vector(n_jobs); + for(int i = start_row; i < end_row; i++){ - std::vector> real_candidates(n_row); + int head = -2; + int length = 0; - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + int jj_start = Ap_copy[i]; + int jj_end = Ap_copy[i+1]; - std::vector split_minmax_ntop(n_jobs, 0); + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) - std::vector thread_list(n_jobs); + int kk_start = Bp_copy[j]; + int kk_end = Bp_copy[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj_copy[kk]; //kth column of B in row j + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - int start_split = job_nr * split_amount; - int end_split = start_split + split_amount; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if (job_nr == n_jobs -1) { - end_split = n_row; - } + if(sums[head] > 
lower_bound_inner){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + temp_candidates.push_back(c); + } - temp_vector[0] = start_split; - temp_vector[1] = end_split; + int temp = head; + head = next[head]; //iterate over columns - split_row_vector[job_nr] = temp_vector; + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - } + std::sort(temp_candidates.begin(), + temp_candidates.end(), candidate_cmp); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { + int len = (int) temp_candidates.size(); + (*total) += len; + *n_minmax = (len > *n_minmax)? len : *n_minmax; + real_candidates[i].swap(temp_candidates); + real_candidates[i].shrink_to_fit(); + } +} +void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* vCj, + std::vector* vCx, + int* n_minmax, + int n_jobs +) +{ + std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; + std::vector> real_candidates(n_row); + std::vector *real_cand_pointer; + real_cand_pointer = &real_candidates[0]; + // initialize aggregates: + std::vector sub_total(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - thread_list[job_nr] = std::thread (inner_sparse_minmax_function, start_row, - end_row, n_col, ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, - &split_minmax_ntop[job_nr]); + // execute the jobs: + std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - } + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - thread_list[job_nr].join(); + thread_list[job_nr] = std::thread ( + inner_sparse_dot_free, + start_row, end_row, + n_col, + lower_bound, + Ap, Aj, Ax, Bp, 
Bj, Bx, + real_cand_pointer, + &sub_total[job_nr], + &split_n_minmax[job_nr] + ); } - int nnz = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - for (int m = 0; m < n_row; m++) { + // gather the results (in parallel): + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector cand = real_cand_pointer[m]; + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - int can_len = (int)cand.size(); + int total = start_points.back(); + vCj->resize(total); + vCx->resize(total); - for(int can_nr=0; can_nr < can_len; can_nr++){ - Cj[nnz] = cand[can_nr].index; - Cx[nnz] = cand[can_nr].value; - nnz++; - } + Cp[0] = 0; + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - Cp[m+1] = nnz; + int start_row = split_row_vector[job_nr][0]; + int end_row = split_row_vector[job_nr][1]; + thread_list[job_nr] = std::thread( + inner_gather_function, + start_row, end_row, + Cp, + start_points[job_nr], + &((*vCj)[0]), + &((*vCx)[0]), + real_cand_pointer + ); } - *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } -void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inner, - int Ap_copy[], int Aj_copy[], - int Bp_copy[], int Bj_copy[], - int *minmax_ntop) +void inner_sparse_only_max_nnz_col( + int start_row, + int end_row, + int n_col_inner, + int Ap_copy[], + int Aj_copy[], + int Bp_copy[], + int Bj_copy[], + int *max_nnz_col // already initialized to 0 +) { std::vector unmarked(n_col_inner, true); @@ -396,55 +604,44 @@ void inner_sparse_only_minmax_function(int start_row, int end_row, int n_col_inn } } } - *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + *max_nnz_col = (length > *max_nnz_col)? 
length : *max_nnz_col; } } -void sparse_dot_only_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop, - int n_jobs) +void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col, + int n_jobs +) { - std::vector job_load_sz(n_jobs, n_row/n_jobs); - - int rem = n_row % n_jobs; - for (int r = 0; r < rem; r++) job_load_sz[r] += 1; - std::vector> split_row_vector(n_jobs); + distribute_load(n_row, n_jobs, split_row_vector); - std::vector split_minmax_ntop(n_jobs, 0); - + std::vector split_max_nnz_col(n_jobs, 0); std::vector thread_list(n_jobs); - - int start = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); - - temp_vector[0] = start; - temp_vector[1] = start + job_load_sz[job_nr]; - start = temp_vector[1]; - - split_row_vector[job_nr] = temp_vector; - } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; int end_row = split_row_vector[job_nr][1]; - thread_list[job_nr] = std::thread (inner_sparse_only_minmax_function, - start_row, end_row, n_col, - Ap, Aj, Bp, Bj, - &split_minmax_ntop[job_nr]); + thread_list[job_nr] = std::thread ( + inner_sparse_only_max_nnz_col, + start_row, end_row, + n_col, + Ap, Aj, Bp, Bj, + &split_max_nnz_col[job_nr] + ); } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - *minmax_ntop = *std::max_element(split_minmax_ntop.begin(), split_minmax_ntop.end()); + *max_nnz_col = *std::max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); } + diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index cb43cd1c..30dc24ef 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -23,44 +23,67 @@ #ifndef UTILS_CPPCLASS_H 
#define UTILS_CPPCLASS_H -extern void sparse_dot_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs); +extern void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs +); -extern void sparse_dot_plus_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* minmax_topn, - int n_jobs); +extern void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs +); -extern void sparse_dot_only_minmax_topn_parallel(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop, - int n_jobs); +extern void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx, + int* n_minmax, + int njobs +); + +extern void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col, + int n_jobs +); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index c4544790..88abbd6a 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -49,19 +49,21 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]) +void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); @@ -133,10 +135,12 @@ void sparse_dot_topn_source(int n_row, } /* - C++ implementation of sparse_dot_source + C++ implementation of sparse_dot_topn_extd_source This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. + C = [sorted top n results > lower_bound for each row of A * B]. + The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. Input: n_row: number of rows of A matrix @@ -145,37 +149,41 @@ void sparse_dot_topn_source(int n_row, Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C + ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: Cp, Cj, Cx: CSR expression of C matrix + n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int memory_bound, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]) +void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int* n_minmax +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); std::vector candidates; - candidates.reserve(memory_bound); int nnz = 0; Cp[0] = 0; + *n_minmax = 0; for(int i = 0; i < n_row; i++){ int head = -2; @@ -219,7 +227,13 @@ void sparse_dot_source(int n_row, } int len = (int)candidates.size(); - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + *n_minmax = (len > *n_minmax)? len : *n_minmax; + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } for(int a=0; a < len; a++){ Cj[nnz] = candidates[a].index; @@ -237,6 +251,7 @@ void sparse_dot_source(int n_row, This function will return a matrix C in CSR format, where C = [all results > lower_bound sorted for each row of A * B]. + It also returns the maximum number of elements per row of C. Input: n_row: number of rows of A matrix @@ -250,24 +265,29 @@ void sparse_dot_source(int n_row, Output by reference: Cp: C array for idx_pointer of CSR expression of C matrix - Cj: numpy array for indices of CSR expression of C matrix - Cx: numpy array for data values of CSR expression of C matrix + Cj: STL vector for indices of CSR expression of C matrix + Cx: STL vector for data values of CSR expression of C matrix + n_minmax: the maximum number of elements per row of C N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_free_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx) +void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int Cp[], + std::vector* Cj, + std::vector* Cx, + int* n_minmax +) { + *n_minmax = 0; int sz = std::max(n_row, n_col); Cj->reserve(sz); Cx->reserve(sz); @@ -321,6 +341,7 @@ void sparse_dot_free_source(int n_row, } int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? len : *n_minmax; std::sort(candidates.begin(), candidates.end(), candidate_cmp); for(int a=0; a < len; a++){ @@ -358,17 +379,19 @@ void sparse_dot_free_source(int n_row, N.B. A and B must be CSR format!!! */ -void sparse_dot_nnz_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop) +void sparse_dot_nnz_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + double lower_bound, + int* nnz, + int* ntop +) { std::vector next(n_col,-1); std::vector sums(n_col, 0); @@ -418,7 +441,7 @@ void sparse_dot_nnz_source(int n_row, } /* - C++ implementation of sparse_dot_only_minmax_topn_source + C++ implementation of sparse_dot_only_max_nnz_col_source This function will return the maximum number of columns set per row over all rows of A * B @@ -431,22 +454,24 @@ void sparse_dot_nnz_source(int n_row, Bp, Bj, Bx: CSR expression of B matrix Output by reference: - minmax_ntop: the maximum number of columns set per row + max_nnz_col: the maximum number of columns set per row over all rows of A * B N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_only_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop) +void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col +) { std::vector unmarked(n_col, true); - *minmax_ntop = 0; + *max_nnz_col = 0; for(int i = 0; i < n_row; i++){ int length = 0; @@ -467,6 +492,6 @@ void sparse_dot_only_minmax_topn_source(int n_row, } } } - *minmax_ntop = (length > *minmax_ntop)? length : *minmax_ntop; + *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; } } diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 664378e3..723e9acc 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -28,21 +28,41 @@ struct candidate {int index; double value;}; extern bool candidate_cmp(candidate c_i, candidate c_j); -extern void sparse_dot_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[]); //data of C +extern void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] //data of C +); + +extern void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], //data of A + int Bp[], + int Bj[], + double Bx[], //data of B + int ntop, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], //data of C + int* n_minmax +); -extern void sparse_dot_free_source(int n_row, +extern void sparse_dot_free_source( + int n_row, int n_col, int Ap[], int Aj[], @@ -53,14 +73,18 @@ extern void sparse_dot_free_source(int n_row, double lower_bound, int Cp[], std::vector* Cj, - std::vector* Cx); + 
std::vector* Cx, + int* n_minmax +); -extern void sparse_dot_only_minmax_topn_source(int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *minmax_ntop); +extern void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int *max_nnz_col +); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 0bb45a6a..86c347ec 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -19,70 +19,97 @@ # distutils: language = c++ -import numpy as np +from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double + cimport numpy as np +import numpy as np + + +np.import_array() + cdef extern from "sparse_dot_topn_parallel.h": cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs); - - cdef void sparse_dot_plus_minmax_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int minmax_ntop[], - int n_jobs); - - cdef void sparse_dot_only_minmax_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int minmax_ntop[], - int n_jobs); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); + + cdef void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); + + cdef void 
sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -98,23 +125,23 @@ cpdef sparse_dot_topn_threaded( lower_bound, Cp, Cj, Cx, n_jobs) return -cpdef sparse_dot_plus_minmax_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] minmax_ntop, - int n_jobs - ): +cpdef sparse_dot_topn_extd_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + 
np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -125,28 +152,62 @@ cpdef sparse_dot_plus_minmax_topn_threaded( cdef int* Cp = &c_indptr[0] cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - cdef int* o_minmax_ntop = &minmax_ntop[0] + cdef int* n_minmax = &nminmax[0] - sparse_dot_plus_minmax_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, o_minmax_ntop, n_jobs) + sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) return -cpdef sparse_dot_only_minmax_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_ntop, - int n_jobs - ): +cpdef sparse_dot_free_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, 
lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] + +cpdef sparse_dot_only_max_nnz_col_threaded( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs + ): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] cdef int* Bp = &b_indptr[0] cdef int* Bj = &b_indices[0] - cdef int* o_minmax_ntop = &minmax_ntop[0] + cdef int* o_max_nnz_col = &max_nnz_col[0] - sparse_dot_only_minmax_topn_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_ntop, n_jobs) + sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) return diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py new file mode 100644 index 00000000..fb0d67ab --- /dev/null +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -0,0 +1,346 @@ +# -*- coding: utf-8 -*- + +from sparse_dot_topn import awesome_cossim_topn +from scipy.sparse.csr import csr_matrix +from scipy.sparse import coo_matrix +from scipy.sparse import rand +import numpy as np +import pandas as pd +import multiprocessing +import pytest + +PRUNE_THRESHOLD = 0.1 +NUM_CANDIDATES = 3 +MEM_MANAGER_IS_C = True +USE_THREADS = True +MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 + + +def get_n_top_sparse(mat, n_top=10): + """ + Get list of (index, value) of the n largest elements in a 1-dimensional sparse matrix + + :param mat: input sparse matrix + :param n_top: number of largest elements, default is 10. 
+ :return: sorted list of largest elements + """ + length = mat.getnnz() + if length == 0: + return None + if length <= n_top: + result = list(zip(mat.indices, mat.data)) + else: + arg_idx = np.argpartition(mat.data, -n_top)[-n_top:] + result = list(zip(mat.indices[arg_idx], mat.data[arg_idx])) + return sorted(result, key=lambda x: -x[1]) + + +def helper_awesome_cossim_topn_dense( + a_dense, + b_dense, + mem_manager_is_C=False, + use_threads=False, + n_jobs=1 + ): + dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product + sparse_result = csr_matrix(dense_result) + sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) + for row in sparse_result] # get ntop using the old method + + pruned_dense_result = dense_result.copy() + pruned_dense_result[pruned_dense_result < PRUNE_THRESHOLD] = 0 # prune low similarity + pruned_sparse_result = csr_matrix(pruned_dense_result) + pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] + + a_csr = csr_matrix(a_dense) + b_csr_t = csr_matrix(b_dense).T + + awesome_result = awesome_cossim_topn( + a_csr, b_csr_t, len(b_dense), + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed + + pruned_awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + len(b_dense), + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + pruned_awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in pruned_awesome_result_top3] + + # no 
candidate selection, no pruning + assert awesome_result.nnz == sparse_result.nnz + # no candidate selection, below PRUNE_THRESHOLD similarity pruned + assert pruned_awesome_result.nnz == pruned_sparse_result.nnz + + all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) + all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) + + # top NUM_CANDIDATES candidates selected, no pruning + if not all_none1: + np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) + else: + assert len(awesome_result_top3) == len(sparse_result_top3) + # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned + if not all_none2: + np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) + else: + assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) + + +def helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + flag=True, + mem_manager_is_C=False, + use_threads=False, + n_jobs=1 + ): + # Note: helper function using awesome_cossim_topn + sparse_result = a_sparse.dot(b_sparse.T) # dot product + sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) + for row in sparse_result] # get ntop using the old method + + pruned_sparse_result = sparse_result.copy() + pruned_sparse_result[pruned_sparse_result < PRUNE_THRESHOLD] = 0 # prune low similarity + pruned_sparse_result.eliminate_zeros() + pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] + + a_csr = csr_matrix(a_sparse) + b_csr_t = csr_matrix(b_sparse).T + + awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + b_sparse.shape[0], + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = [list(zip(row.indices, 
row.data)) if len( + row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed + + pruned_awesome_result = awesome_cossim_topn( + a_csr, + b_csr_t, + b_sparse.shape[0], + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) + pruned_awesome_result_top3 = \ + awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( + row.data) > 0 else None for row in pruned_awesome_result_top3] + + # no candidate selection, no pruning + assert awesome_result.nnz == sparse_result.nnz + # no candidate selection, below PRUNE_THRESHOLD similarity pruned + assert pruned_awesome_result.nnz == pruned_sparse_result.nnz + + if flag: + all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) + all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) + + # top NUM_CANDIDATES candidates selected, no pruning + if not all_none1: + np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) + else: + assert len(awesome_result_top3) == len(sparse_result_top3) + # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned + if not all_none2: + np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) + else: + assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) + else: + assert awesome_result_top3 == sparse_result_top3 + assert pruned_awesome_result_top3 == pruned_sparse_result_top3 + + +def test_awesome_cossim_topn_manually(): + # a simple case + a_dense = [[0.2, 0.1, 0.0, 0.9, 0.3], + [0.7, 0.0, 0.0, 0.2, 0.2], + [0.0, 0.0, 0.0, 0.2, 0.1], + [0.5, 0.4, 0.5, 0.0, 0.0]] + + b_dense = [[0.4, 0.2, 0.3, 0.2, 0.7], + [0.9, 0.4, 0.5, 0.1, 0.4], + [0.3, 0.8, 0.0, 0.2, 0.5], + [0.3, 0.0, 0.1, 0.1, 0.6], + [0.6, 0.1, 0.2, 
0.8, 0.1], + [0.9, 0.1, 0.6, 0.4, 0.3]] + helper_awesome_cossim_topn_dense(a_dense, b_dense) + helper_awesome_cossim_topn_dense(a_dense, b_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_dense( + a_dense, + b_dense, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + # boundary checking, there is no matching at all in this case + c_dense = [[0.2, 0.1, 0.3, 0, 0], + [0.7, 0.2, 0.7, 0, 0], + [0.3, 0.9, 0.6, 0, 0], + [0.5, 0.4, 0.5, 0, 0]] + d_dense = [[0, 0, 0, 0.6, 0.9], + [0, 0, 0, 0.1, 0.1], + [0, 0, 0, 0.2, 0.6], + [0, 0, 0, 0.8, 0.4], + [0, 0, 0, 0.1, 0.3], + [0, 0, 0, 0.7, 0.5]] + helper_awesome_cossim_topn_dense(c_dense, d_dense) + helper_awesome_cossim_topn_dense(c_dense, d_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_dense( + c_dense, + d_dense, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_one_zeros(): + # test with one row matrix with all zeros + # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top + nr_vocab = 1000 + density = 0.1 + for _ in range(3): + a_sparse = csr_matrix(np.zeros((1, nr_vocab))) + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + 
helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_all_zeros(): + # test with all zeros matrix + # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top + nr_vocab = 1000 + density = 0.1 + for _ in range(3): + a_sparse = csr_matrix(np.zeros((2, nr_vocab))) + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_small_matrix(): + # test with small matrix + nr_vocab = 1000 + density = 0.1 + for _ in range(10): + a_sparse = rand(300, nr_vocab, density=density, format='csr') + b_sparse = rand(800, nr_vocab, density=density, format='csr') + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) + 
helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + False, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) + + +@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") +@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") +def test_awesome_cossim_top_large_matrix(): + # MB: I reduced the size of the matrix so the test also runs in small memory. + # test with large matrix + nr_vocab = 2 << 24 + density = 1e-6 + n_samples = 10000 + nnz = int(n_samples * nr_vocab * density) + + rng1 = np.random.RandomState(42) + rng2 = np.random.RandomState(43) + + for _ in range(1): + # scipy.sparse.rand has very high memory usage + # see for details: https://github.com/scipy/scipy/issues/9699 + # a_sparse = rand(500, nr_vocab, density=density, format='csr') + # b_sparse = rand(80000, nr_vocab, density=density, format='csr') + + # switching to alternative random method below, which is also a lot faster + row = rng1.randint(500, size=nnz) + cols = rng2.randint(nr_vocab, size=nnz) + data = rng1.rand(nnz) + + a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a_sparse = a_sparse.tocsr() + + row = rng1.randint(n_samples, size=nnz) + cols = rng2.randint(nr_vocab, size=nnz) + data = rng1.rand(nnz) + + b_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + b_sparse = b_sparse.tocsr() + + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + for process in range(MAX_N_PROCESSES): + n_jobs = process + 1 + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) + helper_awesome_cossim_topn_sparse( + a_sparse, + b_sparse, + False, + mem_manager_is_C=MEM_MANAGER_IS_C, + use_threads=USE_THREADS, + n_jobs=n_jobs + ) diff --git a/string_grouper/string_grouper.py 
b/string_grouper/string_grouper.py index 69ecd912..1ea3b1a9 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,7 +6,7 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn, awesome_cossim_true_minmax_topn_only +from sparse_dot_topn import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 @@ -219,16 +219,16 @@ def __init__(self, master: pd.Series, self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) - self._max_n_matches = DEFAULT_MAX_N_MATCHES if self._config.max_n_matches is None \ + self._max_n_matches = len(self._master) if self._config.max_n_matches is None \ else self._config.max_n_matches self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) - # After the StringGrouper is built, _matches_list will contain the indices and similarities of two matches - # and _true_max_n_matches will contain the true maximum number of matches over all strings in master if - # self._config.min_similarity <= 0 + # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() + # _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: @@ -248,7 +248,7 @@ def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = 
self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity - matches = self._build_matches(master_matrix, duplicate_matrix) + matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) @@ -435,21 +435,12 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { + 'ntop_is_flexible': self._config.max_n_matches is None, + 'return_best_topn': True, 'use_threads': True, 'n_jobs': self._config.number_of_processes } - # compute the true maximum number of matches over all strings in master: - self._true_max_n_matches = awesome_cossim_true_minmax_topn_only( - tf_idf_matrix_1, - tf_idf_matrix_2, - **optional_kwargs - ) - - if self._config.min_similarity <= 0 and self._config.max_n_matches is None: - # if kwarg max_n_matches was not set when min_similarity <= 0 then set it now to its true value - self._max_n_matches = self._true_max_n_matches - return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, self._max_n_matches, diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index c928bfa3..d5c1dd0b 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -334,7 +334,7 @@ def test_build_matches(self): expected_matches = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 0.]]) - np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe).toarray()) + np.testing.assert_array_equal(expected_matches, sg._build_matches(master, dupe)[0].toarray()) def test_build_matches_list(self): """Should create the cosine similarity matrix of two series""" From 
5a12efbf9c7daa0b9df781d1b9964df7839a7a9d Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 24 Apr 2021 00:31:55 +0200 Subject: [PATCH 08/29] defragmented temporary memory allocations in sparse_dot_topn routines --- setup.py | 3 + sparse_dot_topn/array_wrappers.pxd | 16 +- sparse_dot_topn/array_wrappers.pyx | 116 ++-- sparse_dot_topn/awesome_cossim_topn.py | 449 ++++++------- sparse_dot_topn/sparse_dot_topn.pyx | 498 +++++++------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 406 ++++++------ sparse_dot_topn/sparse_dot_topn_source.cpp | 658 ++++++++++--------- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 346 +++++----- string_grouper/string_grouper.py | 4 +- 9 files changed, 1257 insertions(+), 1239 deletions(-) diff --git a/setup.py b/setup.py index 5cb9c5e0..577ed0d9 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_source.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') original_ext = Extension('sparse_dot_topn.sparse_dot_topn', @@ -43,6 +44,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_source.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', @@ -52,6 +54,7 @@ def finalize_options(self): './sparse_dot_topn/sparse_dot_topn_parallel.cpp' ], extra_compile_args=extra_compile_args, + define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') setup( diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd index f3342ef5..d77e41b3 100644 --- a/sparse_dot_topn/array_wrappers.pxd +++ b/sparse_dot_topn/array_wrappers.pxd @@ -2,17 +2,17 @@ from libcpp.vector cimport vector # define a Cython array wrapper class to hold a C++ vector of ints, 
adhering to numpy's buffer protocol: cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] # define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx index d0dd4f3e..ee458629 100644 --- a/sparse_dot_topn/array_wrappers.pyx +++ b/sparse_dot_topn/array_wrappers.pyx @@ -3,71 +3,71 @@ from libcpp.vector cimport vector # define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
- def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + 
self.view_count -= 1 # define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + 
buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 6e459b29..efce38bd 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -4,231 +4,238 @@ from scipy.sparse import isspmatrix_csr if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - ntop_is_flexible=False, - mem_manager_is_C=False, - return_best_topn=False - ): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True it will also return best_topn (the - true maximum number of elements > lower_bound per row of A * B). - - Input: - A and B: two CSR matrices - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - ntop_is_flexible: if True, memory management will be handed over to C/C++ if - python's attempt at allocating memory fails. - mem_manager_is_C: (this is mainly for testing purposes) if True, will force - memory management to be handed over to C/C++. 
Should be - used only when ntop >= number of columns of B or - ntop_is_flexible=True. Defaults to False. - return_best_topn: if True, will return best_topn together with C as a tuple: - (C, best_topn) - - Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, 0 - else: - return output - - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes - except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) - if ntop_is_flexible or ntop >= N: - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. 
- # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) - if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr - ) - - else: - - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr, n_jobs - ) - - else: - if mem_manager_is_C: - raise Exception('When mem_manager_is_C=True, set ntop >= N, or set ntop_is_flexible=True') - else: - raise Exception('Not enough memory! Data array is too large. Try reducing the value of ntop.') - - else: - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) - - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - - best_topn = best_topn_arr[0] - - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + ntop_is_flexible=False, + mem_manager_is_C=False, + return_best_topn=False + ): + """ + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True then best_topn + (the true maximum number of elements > lower_bound per row of A * B) + will also be returned in a tuple together with C as (C, best_topn). + + Input: + A and B: two CSR matrices + ntop: top n results + lower_bound: a threshold that the element of A*B must be greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + ntop_is_flexible: (default: False) if True, memory management will be handed + over to C/C++ whenever python's attempt at allocating + memory fails. + mem_manager_is_C: (default: False) this is mainly for testing purposes. if + True, will force memory management to be handed over to + C/C++. Should be used only when ntop >= number of columns + of B or ntop_is_flexible=True. + return_best_topn: (default: False) if True, will return best_topn together + with C as a tuple: (C, best_topn) + + Output: + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. + + N.B. 
if A and B are not in CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output + + # filled matrices from here on + indptr = np.empty(M + 1, dtype=idx_dtype) + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + if ntop_is_flexible or ntop >= N: + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr + ) + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + lower_bound, + indptr, n_jobs + ) + else: + + if mem_manager_is_C: + raise Exception( + 'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True' + ) + else: + raise Exception( + 'Not enough memory! Data array is too large. Try reducing the value of ntop.' + 'or set ntop_is_flexible=True' + ) + else: + # no exception was raised; then use old function (as it is expected to be the fastest) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + best_topn = best_topn_arr[0] + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) - - return minmax_topn[0] + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return 0 directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_max_nnz_col( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_max_nnz_col_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 9c35d3e9..b4e8463d 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,260 +30,260 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] 
b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + 
a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number of elements per row of C nminmax is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - nminmax: The maximum number of elements per row of C - - N.B. 
A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number of elements per row of C nminmax is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + nminmax: The maximum number of elements per row of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [all results > lower_bound for each row of A * B] - This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the - storage of these results as needed during the computation; then hands over to numpy - a pointer to the memory location where the data resides - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): + """ + Cython glue function to call sparse_dot_free C++ implementation + This function will return a matrix C in CSR format, where + C = [all results > lower_bound for each row of A * B] + This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the + storage of these results as needed during the computation; then hands over to numpy + a pointer to the memory location where the data resides + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index d941248e..fa37746f 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -30,81 +30,85 @@ #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" +struct job_range_type {int begin; int end;}; void distribute_load( int load_sz, int n_jobs, - std::vector> &ranges + std::vector &ranges ) { - // share the load among jobs: - int equal_job_load_sz = load_sz/n_jobs; + // share the load among jobs: + int equal_job_load_sz = load_sz/n_jobs; int rem = load_sz % n_jobs; ranges.resize(n_jobs); - int start = 0; + int start = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - std::vector temp_vector(2, 0); - temp_vector[0] = start; - temp_vector[1] = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); - start = temp_vector[1]; - - ranges[job_nr] = temp_vector; + ranges[job_nr].begin = start; + ranges[job_nr].end = start + equal_job_load_sz + ((job_nr < rem)? 
1 : 0); + start = ranges[job_nr].end; } } void inner_gather_function( - int start_row, - int end_row, + job_range_type job_range, int Cp[], int Cp_start, int vCj_start[], double vCx_start[], - std::vector real_candidates[] + std::vector* real_candidates, + std::vector* row_sizes ) { - int Cp_i = Cp_start; + candidate* c = real_candidates->data(); int* vCj_cursor = &vCj_start[Cp_start]; double* vCx_cursor = &vCx_start[Cp_start]; - candidate c; - for (int i = start_row; i < end_row; i++){ - Cp_i += (int) real_candidates[i].size(); - Cp[i + 1] = Cp_i; - for (unsigned int j = 0; j < real_candidates[i].size(); j++){ - c = real_candidates[i][j]; - *(vCj_cursor++) = c.index; - *(vCx_cursor++) = c.value; + + int Cp_i = Cp_start; + int* row_sizes_ptr = row_sizes->data(); + + for (int i = job_range.begin; i < job_range.end; i++){ + for (int j = 0; j < (*row_sizes_ptr); j++){ + *(vCj_cursor++) = c->index; + *(vCx_cursor++) = (c++)->value; } - real_candidates[i].clear(); + Cp_i += *(row_sizes_ptr++); + Cp[i + 1] = Cp_i; } + real_candidates->clear(); } void inner_sparse_dot_topn( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, - int ntop_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], - int Aj_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], - std::vector real_candidates[], + double Bx_copy[], + std::vector* real_candidates, + std::vector* row_sizes, int* total ) { std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); - for(int i = start_row; i < end_row; i++){ + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); + + for (int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -134,7 +138,7 @@ void inner_sparse_dot_topn( 
candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -144,24 +148,31 @@ void inner_sparse_dot_topn( sums[temp] = 0; //clear arrays } - int len = (int)temp_candidates.size(); + int len = (int) (real_candidates->size() - sz); + + candidate* candidate_arr_begin = real_candidates->data() + sz; if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); len = ntop_inner; } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); } + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; - temp_candidates.resize(len); - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_topn_parallel( @@ -181,13 +192,11 @@ void sparse_dot_topn_parallel( int n_jobs ) { - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; - + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregate: std::vector sub_total(n_jobs, 0); @@ -195,53 +204,48 @@ void sparse_dot_topn_parallel( std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn, - start_row, end_row, + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn, + 
job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results: - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + // gather the results: + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], Cj, Cx, - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_dot_topn_extd( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, int ntop_inner, double lower_bound_inner, @@ -251,25 +255,25 @@ void inner_sparse_dot_topn_extd( int Bp_copy[], int Bj_copy[], double Bx_copy[], - std::vector real_candidates[], + std::vector* real_candidates, + std::vector* row_sizes, int* total, int* n_minmax ) { - std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); - int iterations_count = 0; + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); - for(int i = start_row; i < end_row; i++){ - - 
iterations_count += 1; + for(int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -299,7 +303,7 @@ void inner_sparse_dot_topn_extd( candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -309,25 +313,32 @@ void inner_sparse_dot_topn_extd( sums[temp] = 0; //clear arrays } - int len = (int)temp_candidates.size(); + int len = (int) (real_candidates->size() - sz); *n_minmax = (len > *n_minmax)? len : *n_minmax; + + candidate* candidate_arr_begin = real_candidates->data() + sz; if (len > ntop_inner){ - std::partial_sort(temp_candidates.begin(), - temp_candidates.begin()+ntop_inner, - temp_candidates.end(), - candidate_cmp); + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); len = ntop_inner; } else { - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); } + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; - temp_candidates.resize(len); - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_topn_extd_parallel( @@ -348,94 +359,92 @@ void sparse_dot_topn_extd_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); + std::vector 
split_n_minmax(n_jobs, 0); - std::vector thread_list(n_jobs); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn_extd, - start_row, end_row, + thread_list[job_nr] = std::thread( + inner_sparse_dot_topn_extd, + job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results: - *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); + // gather the results: + *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], Cj, Cx, - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_dot_free( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, - double lower_bound_inner, + 
double lower_bound_inner, int Ap_copy[], - int Aj_copy[], + int Aj_copy[], double Ax_copy[], int Bp_copy[], int Bj_copy[], - double Bx_copy[], - std::vector real_candidates[], + double Bx_copy[], + std::vector* real_candidates, + std::vector* row_sizes, int* total, int* n_minmax ) { - std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - std::vector temp_candidates; + real_candidates->reserve(job_range.end - job_range.begin); + + row_sizes->resize(job_range.end - job_range.begin); + int* row_sizes_ptr = row_sizes->data(); - for(int i = start_row; i < end_row; i++){ + for(int i = job_range.begin; i < job_range.end; i++){ int head = -2; int length = 0; + size_t sz = real_candidates->size(); int jj_start = Ap_copy[i]; int jj_end = Ap_copy[i+1]; @@ -465,7 +474,7 @@ void inner_sparse_dot_free( candidate c; c.index = head; c.value = sums[head]; - temp_candidates.push_back(c); + real_candidates->push_back(c); } int temp = head; @@ -475,16 +484,21 @@ void inner_sparse_dot_free( sums[temp] = 0; //clear arrays } + int len = (int) (real_candidates->size() - sz); - std::sort(temp_candidates.begin(), - temp_candidates.end(), candidate_cmp); + candidate* candidate_arr_begin = real_candidates->data() + sz; + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); - int len = (int) temp_candidates.size(); + real_candidates->resize(sz + (size_t) len); + *(row_sizes_ptr++) = len; (*total) += len; *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - real_candidates[i].swap(temp_candidates); - real_candidates[i].shrink_to_fit(); } + real_candidates->shrink_to_fit(); } void sparse_dot_free_parallel( @@ -504,75 +518,71 @@ void sparse_dot_free_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector> real_candidates(n_row); - std::vector *real_cand_pointer; - real_cand_pointer = &real_candidates[0]; + std::vector > real_candidates(n_jobs); + std::vector> row_sizes(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); + std::vector split_n_minmax(n_jobs, 0); - // execute the jobs: + // execute the jobs: std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread ( - inner_sparse_dot_free, - start_row, end_row, + thread_list[job_nr] = std::thread ( + inner_sparse_dot_free, + job_ranges[job_nr], n_col, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - real_cand_pointer, + Ap, Aj, Ax, Bp, Bj, Bx, + &real_candidates[job_nr], + &row_sizes[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - // gather the results (in parallel): - *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); + // gather the results (in parallel): + *n_minmax = *std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector start_points(n_jobs + 1); + start_points[0] = 0; + std::partial_sum(sub_total.begin(), sub_total.end(), 
start_points.begin() + 1); - int total = start_points.back(); - vCj->resize(total); - vCx->resize(total); + int total = start_points.back(); + vCj->resize(total); + vCj->shrink_to_fit(); + vCx->resize(total); + vCx->shrink_to_fit(); - Cp[0] = 0; + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread( - inner_gather_function, - start_row, end_row, - Cp, + thread_list[job_nr] = std::thread( + inner_gather_function, + job_ranges[job_nr], + Cp, start_points[job_nr], &((*vCj)[0]), &((*vCx)[0]), - real_cand_pointer + &real_candidates[job_nr], + &row_sizes[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); } void inner_sparse_only_max_nnz_col( - int start_row, - int end_row, + job_range_type job_range, int n_col_inner, int Ap_copy[], int Aj_copy[], @@ -583,7 +593,7 @@ void inner_sparse_only_max_nnz_col( { std::vector unmarked(n_col_inner, true); - for(int i = start_row; i < end_row; i++){ + for(int i = job_range.begin; i < job_range.end; i++){ int length = 0; @@ -619,29 +629,25 @@ void sparse_dot_only_max_nnz_col_parallel( int n_jobs ) { - std::vector> split_row_vector(n_jobs); - distribute_load(n_row, n_jobs, split_row_vector); + std::vector job_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_ranges); - std::vector split_max_nnz_col(n_jobs, 0); - std::vector thread_list(n_jobs); + std::vector split_max_nnz_col(n_jobs, 0); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - int start_row = split_row_vector[job_nr][0]; - int end_row = split_row_vector[job_nr][1]; - - thread_list[job_nr] = std::thread ( - inner_sparse_only_max_nnz_col, - start_row, end_row, + thread_list[job_nr] = std::thread ( + inner_sparse_only_max_nnz_col, + job_ranges[job_nr], n_col, Ap, Aj, Bp, Bj, 
&split_max_nnz_col[job_nr] ); - } + } - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) + thread_list[job_nr].join(); - *max_nnz_col = *std::max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); + *max_nnz_col = *max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); } - diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 88abbd6a..f0400f0e 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -6,7 +6,7 @@ * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -29,25 +29,25 @@ bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } /* - C++ implementation of sparse_dot_topn + C++ implementation of sparse_dot_topn - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix + 
Output by reference: + Cp, Cj, Cx: CSR expression of C matrix - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_topn_source( int n_row, @@ -65,98 +65,98 @@ void sparse_dot_topn_source( double Cx[] ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int 
jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } } /* - C++ implementation of sparse_dot_topn_extd_source + C++ implementation of sparse_dot_topn_extd_source - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. 
- Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) + Output by reference: + Cp, Cj, Cx: CSR expression of C matrix + n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_topn_extd_source( int n_row, @@ -175,101 +175,101 @@ void sparse_dot_topn_extd_source( int* n_minmax ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - *n_minmax = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - 
candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + std::vector candidates; + + int nnz = 0; + + Cp[0] = 0; + *n_minmax = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound){ //append the nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } + + for(int a=0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } + candidates.clear(); + + Cp[i+1] = nnz; + } } /* - C++ implementation of sparse_dot_free_source + C++ implementation of sparse_dot_free_source - This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. - It also returns the maximum number of elements per row of C. + This function will return a matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B]. + It also returns the maximum number of elements per row of C. - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C - lower_bound: a threshold that the element of A*B must greater than + memory_bound: the maximum number of elements per row of C + lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp: C array for idx_pointer of CSR expression of C matrix - Cj: STL vector for indices of CSR expression of C matrix - Cx: STL vector for data values of CSR expression of C matrix - n_minmax: the maximum number of elements per row of C + Output by reference: + Cp: C array for idx_pointer of CSR expression of C matrix + Cj: STL vector for indices of CSR expression of C matrix + Cx: STL vector for data values of CSR expression of C matrix + n_minmax: the maximum number of elements per row of C - N.B. 
A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_free_source( int n_row, @@ -292,92 +292,94 @@ void sparse_dot_free_source( Cj->reserve(sz); Cx->reserve(sz); - std::vector next(n_col,-1); - std::vector sums(n_col, 0); + std::vector next(n_col,-1); + std::vector sums(n_col, 0); - std::vector candidates; + std::vector candidates; - Cp[0] = 0; + Cp[0] = 0; - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } + if(sums[head] > lower_bound){ //append the 
nonzero elements + candidate c; + c.index = head; + c.value = sums[head]; + candidates.push_back(c); + } - int temp = head; - head = next[head]; //iterate over columns + int temp = head; + head = next[head]; //iterate over columns - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + int len = (int)candidates.size(); + *n_minmax = (len > *n_minmax)? len : *n_minmax; + std::sort(candidates.begin(), candidates.end(), candidate_cmp); - for(int a=0; a < len; a++){ - Cj->push_back(candidates[a].index); - Cx->push_back(candidates[a].value); - } - candidates.clear(); + for(int a=0; a < len; a++){ + Cj->push_back(candidates[a].index); + Cx->push_back(candidates[a].value); + } + candidates.clear(); - Cp[i+1] = Cj->size(); - } + Cp[i+1] = (int) (Cj->size()); + } + Cj->shrink_to_fit(); + Cx->shrink_to_fit(); } /* - C++ implementation of sparse_dot_nnz_source + C++ implementation of sparse_dot_nnz_source - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. + This function is designed primarily to help with memory management for + very large sparse matrices. 
- Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - lower_bound: a threshold that the element of A*B must greater than + lower_bound: a threshold that the element of A*B must greater than - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C - N.B. A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_nnz_source( int n_row, @@ -393,71 +395,71 @@ void sparse_dot_nnz_source( int* ntop ) { - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - *nnz = 0; - *ntop = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - int nnz_k = 0; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - *ntop = (nnz_k > *ntop)? 
nnz_k : *ntop; - *nnz += nnz_k; - } + std::vector next(n_col,-1); + std::vector sums(n_col, 0); + + *nnz = 0; + *ntop = 0; + + for(int i = 0; i < n_row; i++){ + int head = -2; + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + double v = Ax[jj]; //value of A in (i,j) + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; //kth column of B in row j + + sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; + length++; + } + } + } + + int nnz_k = 0; + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + *ntop = (nnz_k > *ntop)? nnz_k : *ntop; + *nnz += nnz_k; + } } /* - C++ implementation of sparse_dot_only_max_nnz_col_source + C++ implementation of sparse_dot_only_max_nnz_col_source - This function will return the maximum number of columns set - per row over all rows of A * B + This function will return the maximum number of columns set + per row over all rows of A * B - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix + Ap, Aj, Ax: CSR expression of A matrix + Bp, Bj, Bx: CSR expression of B matrix - Output by reference: - max_nnz_col: the maximum number of columns set per row - over all rows of A * B + Output by reference: + max_nnz_col: the maximum number of columns set per row + over all rows of A * B - N.B. 
A and B must be CSR format!!! + N.B. A and B must be CSR format!!! */ void sparse_dot_only_max_nnz_col_source( int n_row, @@ -469,29 +471,29 @@ void sparse_dot_only_max_nnz_col_source( int *max_nnz_col ) { - std::vector unmarked(n_col, true); - - *max_nnz_col = 0; - - for(int i = 0; i < n_row; i++){ - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; // kth column of B in row j - - if(unmarked[k]){ // if this k is not already marked then ... - unmarked[k] = false; // keep a record of column k - length++; - } - } - } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; - } + std::vector unmarked(n_col, true); + + *max_nnz_col = 0; + + for(int i = 0; i < n_row; i++){ + int length = 0; + + int jj_start = Ap[i]; + int jj_end = Ap[i+1]; + for(int jj = jj_start; jj < jj_end; jj++){ + int j = Aj[jj]; + + int kk_start = Bp[j]; + int kk_end = Bp[j+1]; + for(int kk = kk_start; kk < kk_end; kk++){ + int k = Bj[kk]; // kth column of B in row j + + if(unmarked[k]){ // if this k is not already marked then ... + unmarked[k] = false; // keep a record of column k + length++; + } + } + } + *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + } } diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 86c347ec..8b003f08 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,183 +31,183 @@ np.import_array() cdef extern from "sparse_dot_topn_parallel.h": - cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); - - cdef void sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, - int n_jobs - ); + cdef void sparse_dot_topn_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); + + cdef void sparse_dot_topn_extd_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_free_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + double lower_bound, 
+ int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); + + cdef void sparse_dot_only_max_nnz_col_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_jobs) + return cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - 
np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, + lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) + return cpdef sparse_dot_free_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double 
lower_bound, - np.ndarray[int, ndim=1] c_indptr, - int n_jobs - ): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs - ): - - cdef int* Ap 
= &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_max_nnz_col = &max_nnz_col[0] - - sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs + ): + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_max_nnz_col = &max_nnz_col[0] + + sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) + return diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 1ea3b1a9..33bfb0e6 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -135,11 +135,11 @@ def match_strings(master: pd.Series, class StringGrouperConfig(NamedTuple): - """ + r""" Class with configuration variables. :param ngram_size: int. The amount of characters in each n-gram. Default is 3. - :param regex: str. The regex string used to cleanup the input string. Default is [,-./]|\s. + :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. Defaults to 0.8. 
From 30712de5d8c167d36bccf5580ee8d9f6ded7ee94 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 24 Apr 2021 20:27:12 +0200 Subject: [PATCH 09/29] made ntop always flexible (i.e., not only when ntop >= B.shape[1]) --- sparse_dot_topn/awesome_cossim_topn.py | 441 ++++++++------- sparse_dot_topn/sparse_dot_topn.pyx | 515 +++++++++--------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 27 +- sparse_dot_topn/sparse_dot_topn_parallel.h | 1 + sparse_dot_topn/sparse_dot_topn_source.cpp | 20 +- sparse_dot_topn/sparse_dot_topn_source.h | 1 + sparse_dot_topn/sparse_dot_topn_threaded.pyx | 4 +- .../test/test_awesome_cossim_topn.py | 44 +- string_grouper/string_grouper.py | 9 +- 9 files changed, 559 insertions(+), 503 deletions(-) diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index efce38bd..48caaa57 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -4,238 +4,223 @@ from scipy.sparse import isspmatrix_csr if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from sparse_dot_topn import sparse_dot_topn as ct + from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread + import sparse_dot_topn as ct + import sparse_dot_topn_threaded as ct_thread def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - ntop_is_flexible=False, - mem_manager_is_C=False, - return_best_topn=False - ): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True then best_topn - (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_topn). 
- - Input: - A and B: two CSR matrices - ntop: top n results - lower_bound: a threshold that the element of A*B must be greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - ntop_is_flexible: (default: False) if True, memory management will be handed - over to C/C++ whenever python's attempt at allocating - memory fails. - mem_manager_is_C: (default: False) this is mainly for testing purposes. if - True, will force memory management to be handed over to - C/C++. Should be used only when ntop >= number of columns - of B or ntop_is_flexible=True. - return_best_topn: (default: False) if True, will return best_topn together - with C as a tuple: (C, best_topn) - - Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, 0 - else: - return output - - # filled matrices from here on - indptr = np.empty(M + 1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes - - except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) - if ntop_is_flexible or ntop >= N: - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. 
- # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) - if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr - ) - else: - - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - lower_bound, - indptr, n_jobs - ) - else: - - if mem_manager_is_C: - raise Exception( - 'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True' - ) - else: - raise Exception( - 'Not enough memory! Data array is too large. Try reducing the value of ntop.' - 'or set ntop_is_flexible=True' - ) - else: - # no exception was raised; then use old function (as it is expected to be the fastest) - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - best_topn = best_topn_arr[0] - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output + A, + B, + ntop, + lower_bound=0, + use_threads=False, + n_jobs=1, + mem_manager_is_C=False, + return_best_topn=False + ): + """ + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B]. + If return_best_topn=True then best_topn + (the true maximum number of elements > lower_bound per row of A * B) + will also be returned in a tuple together with C as (C, best_topn). + + Input: + A and B: two CSR matrices + ntop: top n results + lower_bound: a threshold that the element of A*B must be greater than + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + mem_manager_is_C: (default: False) this is mainly for testing purposes. if + True, will force memory management to be handed over to + C/C++. + return_best_topn: (default: False) if True, will return best_topn together + with C as a tuple: (C, best_topn) + + Output: + C: result matrix (returned alone, if return_best_topn=False) + best_topn: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_topn). It is + returned only if return_best_topn=True. + + N.B. if A and B are not in CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. 
A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + nnz_max = M*ntop + + # basic check. if A or B are all zeros matrix, return all zero matrix directly + if len(A.indices) == 0 or len(B.indices) == 0: + indptr = np.zeros(M + 1, dtype=idx_dtype) + indices = np.zeros(nnz_max, dtype=idx_dtype) + data = np.zeros(nnz_max, dtype=A.dtype) + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, 0 + else: + return output + + # filled matrices from here on + indptr = np.empty(M+1, dtype=idx_dtype) + try: + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) + if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + except MemoryError: + # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + # It is likely you are here because nnz_max is too large. But don't give up just yet! + # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will + # grow the memory allocations for these arrays as needed without any need for nnz_max. 
+ # Note that reallocations could occur causing data to be copied to other locations + # in memory thus impacting performance + indices = np.empty(0, dtype=idx_dtype) + data = np.empty(0, dtype=A.dtype) + if not use_threads: + + indices, data, best_topn = ct.sparse_dot_free( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, lower_bound, + indptr + ) + + else: + + indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, lower_bound, + indptr, n_jobs + ) + + else: + # no exception was raised; then use old function (as it is expected to be the fastest) + + best_topn_arr = np.full(1, 0, dtype=idx_dtype) + + if not use_threads: + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr + ) + + else: + if n_jobs < 1: + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + A.data, + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + B.data, + ntop, + lower_bound, + indptr, indices, data, best_topn_arr, n_jobs + ) + + best_topn = best_topn_arr[0] + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_topn: + return output, best_topn + else: + return output def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) - - else: - if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
- raise ValueError(err_str) - - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) - - return minmax_topn[0] + """ + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + A and B: two CSR matrix + use_threads: use multi-thread or not + n_jobs: number of thread, must be >= 1 + + Output: + minmax_topn: maximum number of columns set + per row over all rows of A * B + + N.B. if A and B are not CSR format, they will be converted to CSR + """ + if not isspmatrix_csr(A): + A = A.tocsr() + + if not isspmatrix_csr(B): + B = B.tocsr() + + M, K1 = A.shape + K2, N = B.shape + + if K1 != K2: + err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' + raise ValueError(err_str) + + idx_dtype = np.int32 + + minmax_topn = np.full(1, 0, dtype=idx_dtype) + + # basic check. if A or B are all zeros matrix, return 0 directly + if len(A.indices) == 0 or len(B.indices) == 0: + return 0 + + if not use_threads: + + ct.sparse_dot_only_max_nnz_col( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn) + + else: + if n_jobs < 1: + err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' 
+ raise ValueError(err_str) + + ct_thread.sparse_dot_only_max_nnz_col_threaded( + M, N, + np.asarray(A.indptr, dtype=idx_dtype), + np.asarray(A.indices, dtype=idx_dtype), + np.asarray(B.indptr, dtype=idx_dtype), + np.asarray(B.indices, dtype=idx_dtype), + minmax_topn, n_jobs) + + return minmax_topn[0] diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index b4e8463d..1d9e751a 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,260 +30,277 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef void sparse_dot_only_max_nnz_col_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, 
ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data + ): + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + 
a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number of elements per row of C nminmax is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - nminmax: The maximum number of elements per row of C - - N.B. 
A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + ): + """ + Cython glue function to call sparse_dot_topn_extd C++ + implementation. This function will return a matrix C in CSR + format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types + of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [all results > lower_bound for each row of A * B] - This function lets C++ decide how to manage (grow/allocate/reallocate) memory for the - storage of these results as needed during the computation; then hands over to numpy - a pointer to the memory location where the data resides - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr + ): + """ + Cython glue function to call sparse_dot_free C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr: index-pointer of the CSR expression of matrix C + + Returned Output: + c_indices, c_data: indices and data of the CSR expression + of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! 
+ The type of input numpy array must be aligned with types + of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) + cdef int* n_minmax = &nminmax[0] + + cdef vector[int] vCj; + cdef vector[double] vCx; + + sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax) + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn + ): + """ + Cython glue function to call sparse_dot_only_minmax_topn C++ implementation + This function will return the maximum number of columns set + per row over all rows of A * B + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices: CSR indices of A matrix + b_indptr, b_indices: CSR indices of B matrix + + Output by reference: + minmax_ntop: the maximum number of columns set per row over all rows of + A * B + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef int* o_minmax_topn = &minmax_topn[0] + + sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) + return diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index fa37746f..19efe7d2 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -419,6 +419,7 @@ void sparse_dot_topn_extd_parallel( void inner_sparse_dot_free( job_range_type job_range, int n_col_inner, + int ntop_inner, double lower_bound_inner, int Ap_copy[], int Aj_copy[], @@ -485,18 +486,29 @@ void inner_sparse_dot_free( } int len = (int) (real_candidates->size() - sz); + *n_minmax = (len > *n_minmax)? 
len : *n_minmax; candidate* candidate_arr_begin = real_candidates->data() + sz; - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); + if (len > ntop_inner){ + std::partial_sort( + candidate_arr_begin, + candidate_arr_begin + ntop_inner, + candidate_arr_begin + len, + candidate_cmp + ); + len = ntop_inner; + } + else { + std::sort( + candidate_arr_begin, + candidate_arr_begin + len, + candidate_cmp + ); + } real_candidates->resize(sz + (size_t) len); *(row_sizes_ptr++) = len; (*total) += len; - *n_minmax = (len > *n_minmax)? len : *n_minmax; } real_candidates->shrink_to_fit(); } @@ -510,6 +522,7 @@ void sparse_dot_free_parallel( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* vCj, @@ -536,7 +549,7 @@ void sparse_dot_free_parallel( inner_sparse_dot_free, job_ranges[job_nr], n_col, - lower_bound, + ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], &row_sizes[job_nr], diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 30dc24ef..716ca04e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -67,6 +67,7 @@ extern void sparse_dot_free_parallel( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index f0400f0e..17b8b121 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -250,8 +250,9 @@ void sparse_dot_topn_extd_source( C++ implementation of sparse_dot_free_source This function will return a matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B]. - It also returns the maximum number of elements per row of C. + C = [sorted top n results > lower_bound for each row of A * B]. 
+ The maximum number n_minmax of elements per row of C (assuming ntop = n_col) + is also returned. Input: n_row: number of rows of A matrix @@ -260,7 +261,7 @@ void sparse_dot_topn_extd_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - memory_bound: the maximum number of elements per row of C + ntop: n top results lower_bound: a threshold that the element of A*B must greater than Output by reference: @@ -280,6 +281,7 @@ void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, @@ -342,7 +344,13 @@ void sparse_dot_free_source( int len = (int)candidates.size(); *n_minmax = (len > *n_minmax)? len : *n_minmax; - std::sort(candidates.begin(), candidates.end(), candidate_cmp); + + if (len > ntop){ + std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); + len = ntop; + } else { + std::sort(candidates.begin(), candidates.end(), candidate_cmp); + } for(int a=0; a < len; a++){ Cj->push_back(candidates[a].index); @@ -350,10 +358,8 @@ void sparse_dot_free_source( } candidates.clear(); - Cp[i+1] = (int) (Cj->size()); + Cp[i+1] = Cj->size(); } - Cj->shrink_to_fit(); - Cx->shrink_to_fit(); } /* diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 723e9acc..9580d1cf 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -70,6 +70,7 @@ extern void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B + int ntop, double lower_bound, int Cp[], std::vector* Cj, diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 8b003f08..2f858444 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -75,6 +75,7 @@ cdef extern from "sparse_dot_topn_parallel.h": int Bp[], int Bj[], double Bx[], + int ntop, double 
lower_bound, int Cp[], vector[int]* Cj, @@ -167,6 +168,7 @@ cpdef sparse_dot_free_threaded( np.ndarray[int, ndim=1] b_indptr, np.ndarray[int, ndim=1] b_indices, np.ndarray[double, ndim=1] b_data, + int ntop, double lower_bound, np.ndarray[int, ndim=1] c_indptr, int n_jobs @@ -185,7 +187,7 @@ cpdef sparse_dot_free_threaded( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index fb0d67ab..ba7dfbfc 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -62,8 +62,15 @@ def helper_awesome_cossim_topn_dense( use_threads=use_threads, n_jobs=n_jobs ) - awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed @@ -76,8 +83,15 @@ def helper_awesome_cossim_topn_dense( use_threads=use_threads, n_jobs=n_jobs ) - pruned_awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) pruned_awesome_result_top3 
= [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in pruned_awesome_result_top3] @@ -131,8 +145,15 @@ def helper_awesome_cossim_topn_sparse( use_threads=use_threads, n_jobs=n_jobs ) - awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs) + awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + 0.0, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed @@ -145,8 +166,15 @@ def helper_awesome_cossim_topn_sparse( use_threads=use_threads, n_jobs=n_jobs ) - pruned_awesome_result_top3 = \ - awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs) + pruned_awesome_result_top3 = awesome_cossim_topn( + a_csr, + b_csr_t, + NUM_CANDIDATES, + PRUNE_THRESHOLD, + mem_manager_is_C=mem_manager_is_C, + use_threads=use_threads, + n_jobs=n_jobs + ) pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( row.data) > 0 else None for row in pruned_awesome_result_top3] diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 33bfb0e6..d3eb07c6 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -218,9 +218,13 @@ def __init__(self, master: pd.Series, self._duplicates: pd.Series = duplicates if duplicates is not None else None self._master_id: pd.Series = master_id if master_id is not None else None self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None + self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) - self._max_n_matches = len(self._master) if self._config.max_n_matches is None \ - else self._config.max_n_matches + if self._config.max_n_matches is None: + self._max_n_matches = len(self._master) if 
self._duplicates is None else len(self._duplicates) + else: + self._max_n_matches = self._config.max_n_matches + self._validate_group_rep_specs() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not @@ -435,7 +439,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix optional_kwargs = dict() if self._config.number_of_processes > 1: optional_kwargs = { - 'ntop_is_flexible': self._config.max_n_matches is None, 'return_best_topn': True, 'use_threads': True, 'n_jobs': self._config.number_of_processes From 4b86ab1957d1004ec486184b028fa1edfd55fba3 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 26 Apr 2021 14:24:37 +0200 Subject: [PATCH 10/29] removed code-redundancies in sparse_dot_topn --- sparse_dot_topn/sparse_dot_topn.pyx | 214 ++++++++--------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 154 +++--------- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 232 ++++++++++--------- 3 files changed, 254 insertions(+), 346 deletions(-) diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 1d9e751a..580c0f2f 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -31,80 +31,80 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax 
- ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* nminmax + ); cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data +): """ Cython glue function to call sparse_dot_topn C++ implementation This function will return a matrix C in CSR format, where @@ -137,25 +137,27 @@ cpdef sparse_dot_topn( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - sparse_dot_topn_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, 
Bx, ntop, lower_bound, Cp, Cj, Cx) + sparse_dot_topn_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx + ) return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, +): """ Cython glue function to call sparse_dot_topn_extd C++ implementation. 
This function will return a matrix C in CSR @@ -197,22 +199,24 @@ cpdef sparse_dot_topn_extd( cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - sparse_dot_topn_extd_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax) + sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + ) return cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr +): """ Cython glue function to call sparse_dot_free C++ implementation This function will return a matrix C in CSR format, where @@ -259,7 +263,9 @@ cpdef sparse_dot_free( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax) + sparse_dot_free_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax + ) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) @@ -268,14 +274,14 @@ cpdef sparse_dot_free( cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] 
a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] minmax_topn +): """ Cython glue function to call sparse_dot_only_minmax_topn C++ implementation This function will return the maximum number of columns set diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 19efe7d2..1b06e927 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -21,15 +21,14 @@ // April 14, 2021 #include -#include #include #include #include -#include #include "./sparse_dot_topn_source.h" #include "./sparse_dot_topn_parallel.h" + struct job_range_type {int begin; int end;}; void distribute_load( @@ -59,7 +58,7 @@ void inner_gather_function( int vCj_start[], double vCx_start[], std::vector* real_candidates, - std::vector* row_sizes + std::vector* row_nnz ) { candidate* c = real_candidates->data(); @@ -67,17 +66,16 @@ void inner_gather_function( double* vCx_cursor = &vCx_start[Cp_start]; int Cp_i = Cp_start; - int* row_sizes_ptr = row_sizes->data(); + int* row_nnz_ptr = row_nnz->data(); for (int i = job_range.begin; i < job_range.end; i++){ - for (int j = 0; j < (*row_sizes_ptr); j++){ + for (int j = 0; j < (*row_nnz_ptr); j++){ *(vCj_cursor++) = c->index; *(vCx_cursor++) = (c++)->value; } - Cp_i += *(row_sizes_ptr++); + Cp_i += *(row_nnz_ptr++); Cp[i + 1] = Cp_i; } - real_candidates->clear(); } void inner_sparse_dot_topn( @@ -92,7 +90,7 @@ void inner_sparse_dot_topn( int Bj_copy[], double Bx_copy[], std::vector* real_candidates, - std::vector* row_sizes, + std::vector* row_nnz, int* total ) { @@ -101,8 +99,8 @@ void inner_sparse_dot_topn( real_candidates->reserve(job_range.end - job_range.begin); - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); + row_nnz->resize(job_range.end - job_range.begin); + int* row_nnz_ptr = row_nnz->data(); for (int i = job_range.begin; i < 
job_range.end; i++){ @@ -169,7 +167,7 @@ void inner_sparse_dot_topn( } real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; + *(row_nnz_ptr++) = len; (*total) += len; } real_candidates->shrink_to_fit(); @@ -195,8 +193,8 @@ void sparse_dot_topn_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> row_nnz(n_jobs); // initialize aggregate: std::vector sub_total(n_jobs, 0); @@ -211,7 +209,7 @@ void sparse_dot_topn_parallel( lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr] ); } @@ -235,13 +233,12 @@ void sparse_dot_topn_parallel( Cj, Cx, &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - } void inner_sparse_dot_topn_extd( @@ -256,7 +253,7 @@ void inner_sparse_dot_topn_extd( int Bj_copy[], double Bx_copy[], std::vector* real_candidates, - std::vector* row_sizes, + std::vector* row_nnz, int* total, int* n_minmax ) @@ -266,8 +263,8 @@ void inner_sparse_dot_topn_extd( real_candidates->reserve(job_range.end - job_range.begin); - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); + row_nnz->resize(job_range.end - job_range.begin); + int* row_nnz_ptr = row_nnz->data(); for(int i = job_range.begin; i < job_range.end; i++){ @@ -335,7 +332,7 @@ void inner_sparse_dot_topn_extd( } real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; + *(row_nnz_ptr++) = len; (*total) += len; } real_candidates->shrink_to_fit(); @@ -362,8 +359,8 @@ void sparse_dot_topn_extd_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> 
row_nnz(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); @@ -380,7 +377,7 @@ void sparse_dot_topn_extd_parallel( lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); @@ -407,110 +404,12 @@ void sparse_dot_topn_extd_parallel( Cj, Cx, &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - -} - -void inner_sparse_dot_free( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_sizes, - int* total, - int* n_minmax -) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(job_range.end - job_range.begin); - - row_sizes->resize(job_range.end - job_range.begin); - int* row_sizes_ptr = row_sizes->data(); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = 
sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - *n_minmax = (len > *n_minmax)? len : *n_minmax; - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_sizes_ptr++) = len; - (*total) += len; - } - real_candidates->shrink_to_fit(); } void sparse_dot_free_parallel( @@ -534,8 +433,8 @@ void sparse_dot_free_parallel( std::vector job_ranges(n_jobs); distribute_load(n_row, n_jobs, job_ranges); - std::vector > real_candidates(n_jobs); - std::vector> row_sizes(n_jobs); + std::vector> real_candidates(n_jobs); + std::vector> row_nnz(n_jobs); // initialize aggregates: std::vector sub_total(n_jobs, 0); @@ -546,13 +445,13 @@ void sparse_dot_free_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread ( - inner_sparse_dot_free, + inner_sparse_dot_topn_extd, job_ranges[job_nr], n_col, ntop, lower_bound, Ap, Aj, Ax, Bp, Bj, Bx, &real_candidates[job_nr], - &row_sizes[job_nr], + &row_nnz[job_nr], &sub_total[job_nr], &split_n_minmax[job_nr] ); @@ -585,13 +484,12 @@ void sparse_dot_free_parallel( &((*vCj)[0]), &((*vCx)[0]), &real_candidates[job_nr], - &row_sizes[job_nr] + &row_nnz[job_nr] ); } for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - } void inner_sparse_only_max_nnz_col( diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 2f858444..84999abc 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ 
b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -32,85 +32,85 @@ np.import_array() cdef extern from "sparse_dot_topn_parallel.h": cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int n_jobs + ); cdef void sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* n_minmax, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* n_minmax, + int n_jobs + ); cdef void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax, + int n_jobs + ); cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, - int n_jobs - ); + int n_row, + int n_col, + int Ap[], + int Aj[], + int Bp[], + int Bj[], + int* max_nnz_col, + int n_jobs + ); cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - 
np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -122,27 +122,28 @@ cpdef sparse_dot_topn_threaded( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] - sparse_dot_topn_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_jobs) + sparse_dot_topn_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_jobs + ) return cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = 
&a_indices[0] @@ -155,24 +156,25 @@ cpdef sparse_dot_topn_extd_threaded( cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - sparse_dot_topn_extd_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, - lower_bound, Cp, Cj, Cx, n_minmax, n_jobs) + sparse_dot_topn_extd_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax, n_jobs + ) return cpdef sparse_dot_free_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - int n_jobs - ): + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] @@ -187,7 +189,9 @@ cpdef sparse_dot_free_threaded( cdef vector[int] vCj; cdef vector[double] vCx; - sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs) + sparse_dot_free_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs + ) c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) @@ -195,15 +199,15 @@ cpdef sparse_dot_free_threaded( return c_indices, c_data, nminmax[0] cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs - ): + int n_row, + int n_col, 
+ np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[int, ndim=1] max_nnz_col, + int n_jobs +): cdef int* Ap = &a_indptr[0] cdef int* Aj = &a_indices[0] From 2cf60a0106fb6496e3d540066e119502156f9aae Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 11:42:24 +0200 Subject: [PATCH 11/29] made README.md "pypi.org-friendly" --- README.md | 142 +++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 7faa5239..6d391ead 100644 --- a/README.md +++ b/README.md @@ -7,29 +7,29 @@
-:information_source: Click to see image +Click to see image
-
+
-The image displayed above is a visualization of the graph-structure of one of the groups of strings found by string_grouper. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here 0.8). +The image displayed above is a visualization of the graph-structure of one of the groups of strings found by `string_grouper`. Each circle (node) represents a string, and each connecting arc (edge) represents a match between a pair of strings with a similarity score above a given threshold score (here `0.8`). -The ***centroid*** of the group, as determined by string_grouper (see [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. +The ***centroid*** of the group, as determined by `string_grouper` (see [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation), is the largest node, also with the most edges originating from it. A thick line in the image denotes a strong similarity between the nodes at its ends, while a faint thin line denotes weak similarity. -The power of string_grouper is discernible from this image: in large datasets, string_grouper is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score. 
+The power of `string_grouper` is discernible from this image: in large datasets, `string_grouper` is often able to resolve indirect associations between strings even when, say, due to memory-resource-limitations, direct matches between those strings cannot be computed using conventional methods with a lower threshold similarity score.
———
-This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by string_grouper operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. +This image was designed using the graph-visualization software Gephi 0.9.2 with data generated by `string_grouper` operating on the [sec__edgar_company_info.csv](https://www.kaggle.com/dattapiy/sec-edgar-companies-list/version/1) sample data file. ---
-**string_grouper** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **string_grouper** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). +**`string_grouper`** is a library that makes finding groups of similar strings within a single, or multiple, lists of strings easy — and fast. **`string_grouper`** uses **tf-idf** to calculate [**cosine similarities**](https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a) within a single list or between two lists of strings. The full process is described in the blog [Super Fast String Matching in Python](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html). ## Installing -pip install string-grouper +`pip install string-grouper` ## Usage @@ -40,91 +40,91 @@ from string_grouper import match_strings, match_most_similar, \ StringGrouper ``` -As shown above, the library may be used together with pandas, and contains four high level functions (match_strings, match_most_similar, group_similar_strings, and compute_pairwise_similarities) that can be used directly, and one class (StringGrouper) that allows for a more interactive approach. +As shown above, the library may be used together with `pandas`, and contains four high level functions (`match_strings`, `match_most_similar`, `group_similar_strings`, and `compute_pairwise_similarities`) that can be used directly, and one class (`StringGrouper`) that allows for a more interactive approach. 
The permitted calling patterns of the four functions, and their return types, are: -| Function | Parameters | pandas Return Type | +| Function | Parameters | `pandas` Return Type | | -------------: |:-------------|:-----:| -| match_strings| (master, **kwargs)| DataFrame | -| match_strings| (master, duplicates, **kwargs)| DataFrame | -| match_strings| (master, master_id=id_series, **kwargs)| DataFrame | -| match_strings| (master, duplicates, master_id, duplicates_id, **kwargs)| DataFrame | -| match_most_similar| (master, duplicates, **kwargs)| Series (if kwarg `ignore_index=True`) otherwise DataFrame (default)| -| match_most_similar| (master, duplicates, master_id, duplicates_id, **kwargs)| DataFrame | -| group_similar_strings| (strings_to_group, **kwargs)| Series (if kwarg `ignore_index=True`) otherwise DataFrame (default)| -| group_similar_strings| (strings_to_group, strings_id, **kwargs)| DataFrame | -| compute_pairwise_similarities| (string_series_1, string_series_2, **kwargs)| Series | - -In the rest of this document the names, Series and DataFrame, refer to the familiar pandas object types. 
+| `match_strings`| `(master, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, duplicates, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, master_id=id_series, **kwargs)`| `DataFrame` | +| `match_strings`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | +| `match_most_similar`| `(master, duplicates, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| +| `match_most_similar`| `(master, duplicates, master_id, duplicates_id, **kwargs)`| `DataFrame` | +| `group_similar_strings`| `(strings_to_group, **kwargs)`| `Series` (if kwarg `ignore_index=True`) otherwise `DataFrame` (default)| +| `group_similar_strings`| `(strings_to_group, strings_id, **kwargs)`| `DataFrame` | +| `compute_pairwise_similarities`| `(string_series_1, string_series_2, **kwargs)`| `Series` | + +In the rest of this document the names, `Series` and `DataFrame`, refer to the familiar `pandas` object types. #### Parameters: |Name | Description | |:--- | :--- | -|**master** | A Series of strings to be matched with themselves (or with those in duplicates). | -|**duplicates** | A Series of strings to be matched with those of master. | -|**master_id** (or id_series) | A Series of IDs corresponding to the strings in master. | -|**duplicates_id** | A Series of IDs corresponding to the strings in duplicates. | -|**strings_to_group** | A Series of strings to be grouped. | -|**strings_id** | A Series of IDs corresponding to the strings in strings_to_group. | -|**string_series_1(_2)** | A Series of strings each of which is to be compared with its corresponding string in string_series_2(_1). | -|****kwargs** | Keyword arguments (see [below](#kwargs)).| +|**`master`** | A `Series` of strings to be matched with themselves (or with those in `duplicates`). | +|**`duplicates`** | A `Series` of strings to be matched with those of `master`. | +|**`master_id`** (or `id_series`) | A `Series` of IDs corresponding to the strings in `master`. 
| +|**`duplicates_id`** | A `Series` of IDs corresponding to the strings in `duplicates`. | +|**`strings_to_group`** | A `Series` of strings to be grouped. | +|**`strings_id`** | A `Series` of IDs corresponding to the strings in `strings_to_group`. | +|**`string_series_1(_2)`** | A `Series` of strings each of which is to be compared with its corresponding string in `string_series_2(_1)`. | +|**`**kwargs`** | Keyword arguments (see [below](#kwargs)).| #### Functions: * #### `match_strings` - Returns a DataFrame containing similarity-scores of all matching pairs of highly similar strings from master (and duplicates if given). Each matching pair in the output appears in its own row/record consisting of + Returns a `DataFrame` containing similarity-scores of all matching pairs of highly similar strings from `master` (and `duplicates` if given). Each matching pair in the output appears in its own row/record consisting of - 1. its "left" part: a string (with/without its index-label) from master, + 1. its "left" part: a string (with/without its index-label) from `master`, 2. its similarity score, and - 3. its "right" part: a string (with/without its index-label) from duplicates (or master if duplicates is not given), + 3. its "right" part: a string (with/without its index-label) from `duplicates` (or `master` if `duplicates` is not given), in that order. Thus the column-names of the output are a collection of three groups: - 1. The name of master and the name(s) of its index (or index-levels) all prefixed by the string `'left_'`, + 1. The name of `master` and the name(s) of its index (or index-levels) all prefixed by the string `'left_'`, 2. `'similarity'` whose column has the similarity-scores as values, and - 3. The name of duplicates (or master if duplicates is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. + 3. 
The name of `duplicates` (or `master` if `duplicates` is not given) and the name(s) of its index (or index-levels) prefixed by the string `'right_'`. - Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) + Indexes (or their levels) only appear when the keyword argument `ignore_index=False` (the default). (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) - If either master or duplicates has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its pandas default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. + If either `master` or `duplicates` has no name, it assumes the name `'side'` which is then prefixed as described above. Similarly, if any of the indexes (or index-levels) has no name it assumes its `pandas` default name (`'index'`, `'level_0'`, and so on) and is then prefixed as described above. - In other words, if only parameter master is given, the function will return pairs of highly similar strings within master. This can be seen as a self-join where both 'left_' and 'right_' prefixed columns come from master. If both parameters master and duplicates are given, it will return pairs of highly similar strings between master and duplicates. This can be seen as an inner-join where 'left_' and 'right_' prefixed columns come from master and duplicates respectively. + In other words, if only parameter `master` is given, the function will return pairs of highly similar strings within `master`. This can be seen as a self-join where both `'left_'` and `'right_'` prefixed columns come from `master`. 
If both parameters `master` and `duplicates` are given, it will return pairs of highly similar strings between `master` and `duplicates`. This can be seen as an inner-join where `'left_'` and `'right_'` prefixed columns come from `master` and `duplicates` respectively. - The function also supports optionally inputting IDs (master_id and duplicates_id) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional Series prefixed by 'left_' and 'right_' accordingly, and containing the IDs corresponding to the strings in the output. If any of these Series has no name, then it assumes the name `'id'` and is then prefixed as described above. + The function also supports optionally inputting IDs (`master_id` and `duplicates_id`) corresponding to the strings being matched. In which case, the output includes two additional columns whose names are the names of these optional `Series` prefixed by `'left_'` and `'right_'` accordingly, and containing the IDs corresponding to the strings in the output. If any of these `Series` has no name, then it assumes the name `'id'` and is then prefixed as described above. * #### `match_most_similar` - If `ignore_index=True`, returns a Series of strings, where for each string in duplicates the most similar string in master is returned. If there are no similar strings in master for a given string in duplicates (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in duplicates is returned. The output Series thus has the same length and index as duplicates. + If `ignore_index=True`, returns a `Series` of strings, where for each string in `duplicates` the most similar string in `master` is returned. 
If there are no similar strings in `master` for a given string in `duplicates` (because there is no potential match where the cosine similarity is above the threshold \[default: 0.8\]) then the original string in `duplicates` is returned. The output `Series` thus has the same length and index as `duplicates`. - For example, if an input Series with the values \['foooo', 'bar', 'baz'\] is passed as the argument master, and \['foooob', 'bar', 'new'\] as the values of the argument duplicates, the function will return a Series with values: \['foooo', 'bar', 'new'\]. + For example, if an input `Series` with the values `\['foooo', 'bar', 'baz'\]` is passed as the argument `master`, and `\['foooob', 'bar', 'new'\]` as the values of the argument `duplicates`, the function will return a `Series` with values: `\['foooo', 'bar', 'new'\]`. - The name of the output Series is the same as that of master prefixed with the string `'most_similar_'`. If master has no name, it is assumed to have the name `'master'` before being prefixed. + The name of the output `Series` is the same as that of `master` prefixed with the string `'most_similar_'`. If `master` has no name, it is assumed to have the name `'master'` before being prefixed. - If `ignore_index=False` (the default), `match_most_similar` returns a DataFrame containing the same Series described above as one of its columns. So it inherits the same index and length as duplicates. The rest of its columns correspond to the index (or index-levels) of master and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in master for a given string in duplicates then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in duplicates. 
Note that such replacements can only occur if the indexes of master and duplicates have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) + If `ignore_index=False` (the default), `match_most_similar` returns a `DataFrame` containing the same `Series` described above as one of its columns. So it inherits the same index and length as `duplicates`. The rest of its columns correspond to the index (or index-levels) of `master` and thus contain the index-labels of the most similar strings being output as values. If there are no similar strings in `master` for a given string in `duplicates` then the value(s) assigned to this index-column(s) for that string is `NaN` by default. However, if the keyword argument `replace_na=True`, then these `NaN` values are replaced with the index-label(s) of that string in `duplicates`. Note that such replacements can only occur if the indexes of `master` and `duplicates` have the same number of levels. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md#MMS) for a demonstration.) - Each column-name of the output DataFrame has the same name as its corresponding column, index, or index-level of master prefixed with the string `'most_similar_'`. + Each column-name of the output `DataFrame` has the same name as its corresponding column, index, or index-level of `master` prefixed with the string `'most_similar_'`. - If both parameters master_id and duplicates_id are also given, then a DataFrame is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input Series corresponding to the output strings. This column's name is the same as that of master_id prefixed in the same way as described above. If master_id has no name, it is assumed to have the name `'master_id'` before being prefixed. 
+ If both parameters `master_id` and `duplicates_id` are also given, then a `DataFrame` is always returned with the same column(s) as described above, but with an additional column containing those IDs from these input `Series` corresponding to the output strings. This column's name is the same as that of `master_id` prefixed in the same way as described above. If `master_id` has no name, it is assumed to have the name `'master_id'` before being prefixed. * #### `group_similar_strings` - Takes a single Series of strings (strings_to_group) and groups them by assigning to each string one string from strings_to_group chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) + Takes a single `Series` of strings (`strings_to_group`) and groups them by assigning to each string one string from `strings_to_group` chosen as the group-representative for each group of similar strings found. (See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for details on how the the group-representatives are chosen.) - If `ignore_index=True`, the output is a Series (with the same name as strings_to_group prefixed by the string `'group_rep_'`) of the same length and index as strings_to_group containing the group-representative strings. If strings_to_group has no name then the name of the returned Series is `'group_rep'`. + If `ignore_index=True`, the output is a `Series` (with the same name as `strings_to_group` prefixed by the string `'group_rep_'`) of the same length and index as `strings_to_group` containing the group-representative strings. If `strings_to_group` has no name then the name of the returned `Series` is `'group_rep'`. - For example, an input Series with values: \['foooo', 'foooob', 'bar'\] will return \['foooo', 'foooo', 'bar'\]. 
Here 'foooo' and 'foooob' are grouped together into group 'foooo' because they are found to be similar. Another example can be found [below](#dedup). + For example, an input Series with values: `\['foooo', 'foooob', 'bar'\]` will return `\['foooo', 'foooo', 'bar'\]`. Here `'foooo'` and `'foooob'` are grouped together into group `'foooo'` because they are found to be similar. Another example can be found [below](#dedup). - If `ignore_index=False`, the output is a DataFrame containing the above output Series as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of strings_to_group and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`. + If `ignore_index=False`, the output is a `DataFrame` containing the above output `Series` as one of its columns with the same name. The remaining column(s) correspond to the index (or index-levels) of `strings_to_group` and contain the index-labels of the group-representatives as values. These columns have the same names as their counterparts prefixed by the string `'group_rep_'`. - If strings_id is also given, then the IDs from strings_id corresponding to the group-representatives are also returned in an additional column (with the same name as strings_id prefixed as described above). If strings_id has no name, it is assumed to have the name `'id'` before being prefixed. + If `strings_id` is also given, then the IDs from `strings_id` corresponding to the group-representatives are also returned in an additional column (with the same name as `strings_id` prefixed as described above). If `strings_id` has no name, it is assumed to have the name `'id'` before being prefixed. * #### `compute_pairwise_similarities` - Returns a Series of cosine similarity scores the same length and index as string_series_1. 
Each score is the cosine similarity between the pair of strings in the same position (row) in the two input Series, string_series_1 and string_series_2, as the position of the score in the output Series. This can be seen as an element-wise comparison between the two input Series. + Returns a `Series` of cosine similarity scores the same length and index as `string_series_1`. Each score is the cosine similarity between the pair of strings in the same position (row) in the two input `Series`, `string_series_1` and `string_series_2`, as the position of the score in the output `Series`. This can be seen as an element-wise comparison between the two input `Series`. -All functions are built using a class **StringGrouper**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **StringGrouper** class directly. +All functions are built using a class **`StringGrouper`**. This class can be used through pre-defined functions, for example the four high level functions above, as well as using a more interactive approach where matches can be added or removed if needed by calling the **`StringGrouper`** class directly. #### Options: @@ -133,17 +133,17 @@ All functions are built using a class **StringGrouper**. This class All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used: - * **ngram_size**: The amount of characters in each n-gram. Default is 3. - * **regex**: The regex string used to clean-up the input string. Default is "[,-./]|\s". - * **max_n_matches**: The maximum number of matches allowed per string in master. Default is 20. - * **min_similarity**: The minimum cosine similarity for two strings to be considered a match. 
- Defaults to 0.8 - * **number_of_processes**: The number of processes used by the cosine similarity calculation. Defaults to + * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. + * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. + * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). + * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. + Defaults to `0.8` + * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to `number of cores on a machine - 1.` - * **ignore_index**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **replace_na**: For function match_most_similar, determines whether `NaN` values in index-columns are replaced or not by index-labels from duplicates. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](tutorials/ignore_index_and_replace_na.md) for a demonstration.) - * **include_zeroes**: When min_similarity ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](tutorials/zero_similarity.md).) **Note:** If include_zeroes is `True` and the kwarg max_n_matches is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and string_grouper suggests an alternative value for max_n_matches. To allow string_grouper to automatically use the appropriate value for max_n_matches then do not set this kwarg at all. - * **group_rep**: For function group_similar_strings, determines how group-representatives are chosen. 
Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](tutorials/group_representatives.md) for an explanation. + * **`ignore_index`**: Determines whether indexes are ignored or not. If `False` (the default), index-columns will appear in the output, otherwise not. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`replace_na`**: For function `match_most_similar`, determines whether `NaN` values in index-columns are replaced or not by index-labels from `duplicates`. Defaults to `False`. (See [tutorials/ignore_index_and_replace_na.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/ignore_index_and_replace_na.md) for a demonstration.) + * **`include_zeroes`**: When `min_similarity` ≤ 0, determines whether zero-similarity matches appear in the output. Defaults to `True`. (See [tutorials/zero_similarity.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/zero_similarity.md).) **Note:** If `include_zeroes` is `True` and the kwarg `max_n_matches` is set then it must be sufficiently high to capture ***all*** nonzero-similarity-matches, otherwise an error is raised and `string_grouper` suggests an alternative value for `max_n_matches`. To allow `string_grouper` to automatically use the appropriate value for `max_n_matches` then do not set this kwarg at all. + * **`group_rep`**: For function `group_similar_strings`, determines how group-representatives are chosen. Allowed values are `'centroid'` (the default) and `'first'`. See [tutorials/group_representatives.md](https://github.com/Bergvca/string_grouper/blob/master/tutorials/group_representatives.md) for an explanation. ## Examples @@ -231,7 +231,7 @@ matches[matches['left_Company Name'] != matches['right_Company Name']].head() ### Find all matches in between two data sets. 
-The match_strings function finds similar items between two data sets as well. This can be seen as an inner join between two data sets: +The `match_strings` function finds similar items between two data sets as well. This can be seen as an inner join between two data sets: ```python @@ -301,11 +301,11 @@ matches -Out of the four company names in duplicates, three companies are found in the original company data set. One company is found three times. +Out of the four company names in `duplicates`, three companies are found in the original company data set. One company is found three times. ### Finding duplicates from a (database extract to) DataFrame where IDs for rows are supplied. -A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the match_strings function duplicates can be found easily. A [tutorial](tutorials/tutorial_1.md) that steps though the process with an example data set is available. +A very common scenario is the case where duplicate records for an entity have been entered into a database. That is, there are two or more records where a name field has slightly different spelling. For example, "A.B. Corporation" and "AB Corporation". Using the optional 'ID' parameter in the `match_strings` function duplicates can be found easily. A [tutorial](https://github.com/Bergvca/string_grouper/blob/master/tutorials/tutorial_1.md) that steps through the process with an example data set is available. ### For a second data set, find only the most similar match @@ -362,7 +362,7 @@ pd.concat([new_companies, matches], axis=1) ### Deduplicate a single data set and show items with most duplicates -The group_similar_strings function groups strings that are similar using a single linkage clustering algorithm. 
That is, if item A and item B are similar; and item B and item C are similar; but the similarity between A and C is below the threshold; then all three items are grouped together. +The `group_similar_strings` function groups strings that are similar using a single linkage clustering algorithm. That is, if item A and item B are similar; and item B and item C are similar; but the similarity between A and C is below the threshold; then all three items are grouped together. ```python # Add the grouped strings: @@ -389,7 +389,7 @@ companies.groupby('deduplicated_name')['Line Number'].count().sort_values(ascend Name: Line Number, dtype: int64 -The group_similar_strings function also works with IDs: imagine a DataFrame (customers_df) with the following content: +The `group_similar_strings` function also works with IDs: imagine a `DataFrame` (`customers_df`) with the following content: ```python # Create a small set of artificial customer names: customers_df = pd.DataFrame( @@ -443,7 +443,7 @@ customers_df -The output of group_similar_strings can be directly used as a mapping table: +The output of `group_similar_strings` can be directly used as a mapping table: ```python # Group customers with similar names: customers_df[["group-id", "name_deduped"]] = \ @@ -503,11 +503,11 @@ customers_df -Note that here customers_df initially had only one column "Customer Name" (before the group_similar_strings function call); and it acquired two more columns "group-id" (the index-column) and "name_deduped" after the call through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a pandas feature). 
+Note that here `customers_df` initially had only one column "Customer Name" (before the `group_similar_strings` function call); and it acquired two more columns "group-id" (the index-column) and "name_deduped" after the call through a "[setting with enlargement](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#setting-with-enlargement)" (a `pandas` feature). ### Simply compute the cosine similarities of pairs of strings -Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function compute_pairwise_similarities: +Sometimes we have pairs of strings that have already been matched but whose similarity scores need to be computed. For this purpose we provide the function `compute_pairwise_similarities`: ```python # Create a small DataFrame of pairs of strings: @@ -640,14 +640,14 @@ pair_s ## The StringGrouper class -The four functions mentioned above all create a StringGrouper object behind the scenes and call different functions on it. The StringGrouper class keeps track of all tuples of similar strings and creates the groups out of these. Since matches are often not perfect, a common workflow is to: +The four functions mentioned above all create a `StringGrouper` object behind the scenes and call different functions on it. The `StringGrouper` class keeps track of all tuples of similar strings and creates the groups out of these. Since matches are often not perfect, a common workflow is to: 1. Create matches 2. Manually inspect the results 3. Add and remove matches where necessary 4. Create groups of similar strings -The StringGrouper class allows for this without having to re-calculate the cosine similarity matrix. See below for an example. +The `StringGrouper` class allows for this without having to re-calculate the cosine similarity matrix. See below for an example. 
```python From c96ec50fe41e5b469fc30177c138389ef5bacab7 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 28 Apr 2021 12:12:19 +0200 Subject: [PATCH 12/29] rearranged code in string_grouper.py --- string_grouper/string_grouper.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d3eb07c6..d4f38387 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -468,22 +468,6 @@ def _get_non_matches_list(self) -> pd.DataFrame: missing_pairs['similarity'] = 0 return missing_pairs - @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: - A = AA.tolil() - r, c = A.nonzero() - A[c, r] = A[r, c] - return A.tocsr() - - @staticmethod - def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: - """Returns a list of all the indices of matches""" - r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), - 'similarity': matches.data}) - return matches_list - def _get_nearest_matches(self, ignore_index=False, replace_na=False) -> Union[pd.DataFrame, pd.Series]: @@ -634,6 +618,21 @@ def _validate_replace_na_and_drop(self): "index if the number of index-levels does not equal the number of index-columns." 
) + @staticmethod + def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + A = AA.tolil() + r, c = A.nonzero() + A[c, r] = A[r, c] + return A.tocsr() + + @staticmethod + def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: + """Returns a list of all the indices of matches""" + r, c = matches.nonzero() + return pd.DataFrame({'master_side': r.astype(np.int64), + 'dupe_side': c.astype(np.int64), + 'similarity': matches.data}) + @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: columns_switched = pd.DataFrame({'master_side': new_matches.dupe_side, From 0f0b2c3207a0517f661d49d6ed874ec8dd26407a Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 29 Apr 2021 07:08:29 +0200 Subject: [PATCH 13/29] corrected optional_kwargs for awesome_cossim_dotn in _build_matches() so that return_best_topn is always True --- string_grouper/string_grouper.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d4f38387..4ea5e380 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -436,13 +436,11 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - optional_kwargs = dict() - if self._config.number_of_processes > 1: - optional_kwargs = { - 'return_best_topn': True, - 'use_threads': True, - 'n_jobs': self._config.number_of_processes - } + optional_kwargs = { + 'return_best_topn': True, + 'use_threads': self._config.number_of_processes > 1, + 'n_jobs': self._config.number_of_processes + } return awesome_cossim_topn( tf_idf_matrix_1, tf_idf_matrix_2, From 57c4122d3fa3af8882312fde71c939086dfbd5b8 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 29 Apr 2021 21:51:23 +0200 Subject: [PATCH 14/29] added 
scouting function that determines the amount of memory needed for the matrix-product results. Also discarded the entire C/C++ memory management extension array_wrappers. --- setup.py | 11 +- sparse_dot_topn/array_wrappers.pxd | 18 - sparse_dot_topn/array_wrappers.pyx | 73 --- sparse_dot_topn/awesome_cossim_topn.py | 182 ++----- sparse_dot_topn/example/comparison2.py | 296 ++++++----- sparse_dot_topn/sparse_dot_topn.pyx | 491 ++++++++---------- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 164 ++---- sparse_dot_topn/sparse_dot_topn_parallel.h | 39 +- sparse_dot_topn/sparse_dot_topn_source.cpp | 158 ++---- sparse_dot_topn/sparse_dot_topn_source.h | 18 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 56 +- .../test/test_awesome_cossim_topn.py | 46 +- string_grouper/string_grouper.py | 2 +- 13 files changed, 557 insertions(+), 997 deletions(-) delete mode 100644 sparse_dot_topn/array_wrappers.pxd delete mode 100644 sparse_dot_topn/array_wrappers.pyx diff --git a/setup.py b/setup.py index 577ed0d9..c47aa78b 100644 --- a/setup.py +++ b/setup.py @@ -29,15 +29,6 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] -array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', - sources=[ - './sparse_dot_topn/array_wrappers.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - original_ext = Extension('sparse_dot_topn.sparse_dot_topn', sources=[ './sparse_dot_topn/sparse_dot_topn.pyx', @@ -91,5 +82,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[array_wrappers_ext, original_ext, threaded_ext] + ext_modules=[original_ext, threaded_ext] ) diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd deleted file mode 100644 index d77e41b3..00000000 --- a/sparse_dot_topn/array_wrappers.pxd +++ /dev/null @@ -1,18 +0,0 @@ 
-from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - diff --git a/sparse_dot_topn/array_wrappers.pyx b/sparse_dot_topn/array_wrappers.pyx deleted file mode 100644 index ee458629..00000000 --- a/sparse_dot_topn/array_wrappers.pyx +++ /dev/null @@ -1,73 +0,0 @@ -from cpython cimport Py_buffer -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index 48caaa57..baa14fbc 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -12,21 +12,13 @@ def awesome_cossim_topn( - A, - B, - ntop, - lower_bound=0, - use_threads=False, - n_jobs=1, - mem_manager_is_C=False, - return_best_topn=False - ): + A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, scout_nnz=False, return_best_ntop=False): """ This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. - If return_best_topn=True then best_topn + If return_best_ntop=True then best_ntop (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_topn). + will also be returned in a tuple together with C as (C, best_ntop). 
Input: A and B: two CSR matrices @@ -34,23 +26,22 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - mem_manager_is_C: (default: False) this is mainly for testing purposes. if - True, will force memory management to be handed over to - C/C++. - return_best_topn: (default: False) if True, will return best_topn together - with C as a tuple: (C, best_topn) + scout_nnz: (default: False) this is mainly for testing purposes. if + True, will force a memory-size determination before computing + the results. + return_best_ntop: (default: False) if True, will return best_ntop together + with C as a tuple: (C, best_ntop) Output: - C: result matrix (returned alone, if return_best_topn=False) - best_topn: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_topn). It is - returned only if return_best_topn=True. + C: result matrix (returned alone, if return_best_ntop=False) + best_ntop: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_ntop). It is + returned only if return_best_ntop=True. N.B. 
if A and B are not in CSR format, they will be converted to CSR """ if not isspmatrix_csr(A): A = A.tocsr() - if not isspmatrix_csr(B): B = B.tocsr() @@ -71,7 +62,7 @@ def awesome_cossim_topn( indices = np.zeros(nnz_max, dtype=idx_dtype) data = np.zeros(nnz_max, dtype=A.dtype) output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: + if return_best_ntop: return output, 0 else: return output @@ -81,146 +72,77 @@ def awesome_cossim_topn( try: indices = np.empty(nnz_max, dtype=idx_dtype) data = np.empty(nnz_max, dtype=A.dtype) - if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes + if scout_nnz: raise MemoryError # This is mainly for testing purposes except MemoryError: - # if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True) + # if scout_nnz: print('Exception raised! Continuing ...', flush=True) # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will hand over the memory allocation/management to C++. C++ will - # grow the memory allocations for these arrays as needed without any need for nnz_max. - # Note that reallocations could occur causing data to be copied to other locations - # in memory thus impacting performance - indices = np.empty(0, dtype=idx_dtype) - data = np.empty(0, dtype=A.dtype) + # sparse_dot_topn will go ahead and count the exact amount of memory required. 
if not use_threads: - - indices, data, best_topn = ct.sparse_dot_free( - M, N, np.asarray(A.indptr, dtype=idx_dtype), + + nnz = ct.sparse_dot_only_nnz(M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), B.data, - ntop, lower_bound, - indptr + ntop, lower_bound ) else: - indices, data, best_topn = ct_thread.sparse_dot_free_threaded( + nnz = ct_thread.sparse_dot_only_nnz_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), B.data, - ntop, lower_bound, - indptr, n_jobs + ntop, lower_bound, n_jobs ) - - else: - # no exception was raised; then use old function (as it is expected to be the fastest) - - best_topn_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: + + nnz = max(1, nnz) + indices = np.empty(nnz, dtype=idx_dtype) + data = np.empty(nnz, dtype=A.dtype) - ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr - ) + # no exception was raised; then use old function (as it is expected to be the fastest) - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) + best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_topn_arr, n_jobs - ) - - best_topn = best_topn_arr[0] - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_topn: - return output, best_topn - else: - return output - - -def awesome_cossim_only_max_nnz_col(A, B, use_threads=False, n_jobs=1): - """ - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - A and B: two CSR matrix - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - - Output: - minmax_topn: maximum number of columns set - per row over all rows of A * B - - N.B. if A and B are not CSR format, they will be converted to CSR - """ - if not isspmatrix_csr(A): - A = A.tocsr() - - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - minmax_topn = np.full(1, 0, dtype=idx_dtype) - - # basic check. 
if A or B are all zeros matrix, return 0 directly - if len(A.indices) == 0 or len(B.indices) == 0: - return 0 - if not use_threads: - - ct.sparse_dot_only_max_nnz_col( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), + + ct.sparse_dot_topn_extd( + M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), + A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), - minmax_topn) + B.data, + ntop, + lower_bound, + indptr, indices, data, best_ntop_arr + ) else: if n_jobs < 1: - err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!' + err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' raise ValueError(err_str) - ct_thread.sparse_dot_only_max_nnz_col_threaded( - M, N, - np.asarray(A.indptr, dtype=idx_dtype), + ct_thread.sparse_dot_topn_extd_threaded( + M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), + A.data, np.asarray(B.indptr, dtype=idx_dtype), np.asarray(B.indices, dtype=idx_dtype), - minmax_topn, n_jobs) + B.data, + ntop, + lower_bound, + indptr, indices, data, best_ntop_arr, n_jobs + ) + + # prepare and return the output: + output = csr_matrix((data, indices, indptr), shape=(M, N)) + if return_best_ntop: + return output, best_ntop_arr[0] + else: + return output - return minmax_topn[0] diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py index 7af5d08a..c54a2ff8 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/sparse_dot_topn/example/comparison2.py @@ -5,165 +5,177 @@ from __future__ import print_function import timeit import numpy as np +import pandas as pd from scipy.sparse import coo_matrix from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) + N = 1000 thresh = 0.01 -nr_vocab = 2 << 24 -density = 1e-6 +nr_vocab = int(26**3) +density = 30/nr_vocab 
n_samples = 1000000 n_duplicates = N nnz_a = int(n_samples * nr_vocab * density) nnz_b = int(n_duplicates * nr_vocab * density) +print(f'ntop = {N}', flush=True) +print(f'threshold = {thresh}', flush=True) print(f'density = {density}', flush=True) print(f'nr_vocab = {nr_vocab}', flush=True) print(f'n_samples = {n_samples}', flush=True) print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_a = {nnz_a}', flush=True) -print(f'nnz_b = {nnz_b}', flush=True) +print(f'nnz_A = {nnz_a}', flush=True) +print(f'nnz_B = {nnz_b}', flush=True) print('', flush=True) rng1 = np.random.RandomState(42) rng2 = np.random.RandomState(43) -row = rng1.randint(n_samples, size=nnz_a) -cols = rng2.randint(nr_vocab, size=nnz_a) -data = rng1.rand(nnz_a) - -a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) -a = a_sparse.tocsr() - -row = rng1.randint(n_duplicates, size=nnz_b) -cols = rng2.randint(nr_vocab, size=nnz_b) -data = rng1.rand(nnz_b) - -b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) -b = b_sparse.T.tocsr() - - -# top 5 results per row - -print("Non-parallelized sparse_dot_topn function") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 1 thread") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 2 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) -rtv2 = 
timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 3 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 4 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 5 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 6 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) -print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - -print("Threaded function with 7 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) -rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, mem_manager_is_C=True)', - number=3, - globals=globals()) -print('python\t\tC/C++', flush=True) 
-print(f'{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) - - -# use scipy and numpy function - - -def get_csr_ntop_idx_data(csr_row, ntop): - """ - Get list (row index, score) of the n top matches - """ - nnz = csr_row.getnnz() - if nnz == 0: - return None - elif nnz <= ntop: - result = zip(csr_row.indices, csr_row.data) - else: - arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] - result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) - - return sorted(result, key=lambda x: -x[1]) - - -def scipy_cossim_top(A, B, ntop, lower_bound=0): - C = A.dot(B) - return [get_csr_ntop_idx_data(row, ntop) for row in C] - -# top 5 results per row which element is greater than 2 - - -print("Scipy+numpy original function") - -rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) +n_matrix_pairs = 2**4 +nnz_arr = np.full(n_matrix_pairs, 0) +ntop_arr = np.full(n_matrix_pairs, 0) +r = 0 +for it in range(n_matrix_pairs): + + row = rng1.randint(n_samples, size=nnz_a) + cols = rng2.randint(nr_vocab, size=nnz_a) + data = rng1.rand(nnz_a) + + a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a = a_sparse.tocsr() + + row = rng1.randint(n_duplicates, size=nnz_b) + cols = rng2.randint(nr_vocab, size=nnz_b) + data = rng1.rand(nnz_b) + + b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) + b = b_sparse.T.tocsr() + + C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True) + print(f'nnz(A*B) = {len(C.data)}', flush=True) + print(f'ntop(A*B) = {C_ntop}', flush=True) + print('', flush=True) + nnz_arr[it] = len(C.data) + ntop_arr[it] = C_ntop + + + # top 5 results per row + + print("Non-parallelized sparse_dot_topn function") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 0, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] 
+ r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 1 thread") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 1, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 2 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 2, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 3 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 3, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 4 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 4, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 5 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, 
thresh, True, 5)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 5, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 6 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 6, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print("Threaded function with 7 threads") + + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', + number=3, + globals=globals()) + rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, scout_nnz=True)', + number=3, + globals=globals()) + df.loc[r] = [it, 7, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + r += 1 + print('sample\t\tpython\t\t+scout', flush=True) + print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + + print('') + print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') + print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') + print('') + df = df.astype({ + 'sample': np.int64, '#threads': np.int64, 'python': np.float64, '+scout': np.float64, '%inc': np.float64}) + results = df.groupby('#threads', as_index=True, sort=True)[['python', '+scout', '%inc']].mean() + + print(results) + print('') + print('') diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 580c0f2f..9728c467 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -5,7 +5,7 @@ # The ASF licenses this file to You under the Apache License, Version 2.0 # (the 
"License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# +# http://www.apache.org/licenses/LICENSE-2.0# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,6 @@ # distutils: language = c++ from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -30,283 +29,225 @@ np.import_array() cdef extern from "sparse_dot_topn_source.h": - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef void sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int* nminmax - ); - - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - - cdef void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col - ); + cdef void sparse_dot_topn_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[] + ); + + cdef void sparse_dot_topn_extd_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int topn, + double lower_bound, + int Cp[], + int Cj[], + double Cx[], + int* 
nminmax + ); + + cdef void sparse_dot_free_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int Cp[], + vector[int]* Cj, + vector[double]* Cx, + int* n_minmax + ); + + cdef int sparse_dot_only_nnz_source( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound + ); cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data ): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx - ) - return + """ + Cython glue function to call sparse_dot_topn C++ implementation + This function will return a matrix C in CSR format, where + C = [sorted top n results and results > lower_bound for each row of A * B] + + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n top results + lower_bound: a threshold that the element of A*B must greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of C matrix + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + + sparse_dot_topn_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx + ) + return cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound, + np.ndarray[int, ndim=1] c_indptr, + np.ndarray[int, ndim=1] c_indices, + np.ndarray[double, ndim=1] c_data, + np.ndarray[int, ndim=1] nminmax, ): - """ - Cython glue function to call sparse_dot_topn_extd C++ - implementation. This function will return a matrix C in CSR - format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. 
- - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax - ) - return - -cpdef sparse_dot_free( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr + """ + Cython glue function to call sparse_dot_topn_extd C++ + implementation. This function will return a matrix C in CSR + format, where + C = [sorted top n results > lower_bound for each row of A * B] + The maximum number nminmax of elements per row of C (assuming + n = number of columns of B) is also returned. 
+ + Input: + n_row: number of rows of A matrix + n_col: number of columns of B matrix + + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Output by reference: + c_indptr, c_indices, c_data: CSR expression of matrix C + nminmax: The maximum number of elements per row of C + (assuming ntop = n_col) + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types + of C++ function arguments! + """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + cdef int* Cp = &c_indptr[0] + cdef int* Cj = &c_indices[0] + cdef double* Cx = &c_data[0] + cdef int* n_minmax = &nminmax[0] + + sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + ) + return + + +cpdef sparse_dot_only_nnz( + int n_row, + int n_col, + np.ndarray[int, ndim=1] a_indptr, + np.ndarray[int, ndim=1] a_indices, + np.ndarray[double, ndim=1] a_data, + np.ndarray[int, ndim=1] b_indptr, + np.ndarray[int, ndim=1] b_indices, + np.ndarray[double, ndim=1] b_data, + int ntop, + double lower_bound ): - """ - Cython glue function to call sparse_dot_free C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. 
- - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr: index-pointer of the CSR expression of matrix C - - Returned Output: - c_indices, c_data: indices and data of the CSR expression - of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax - ) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] - - -cpdef sparse_dot_only_max_nnz_col( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] minmax_topn -): - """ - Cython glue function to call sparse_dot_only_minmax_topn C++ implementation - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices: CSR indices of A matrix - b_indptr, b_indices: 
CSR indices of B matrix - - Output by reference: - minmax_ntop: the maximum number of columns set per row over all rows of - A * B - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef int* o_minmax_topn = &minmax_topn[0] - - sparse_dot_only_max_nnz_col_source(n_row, n_col, Ap, Aj, Bp, Bj, o_minmax_topn) - return + """ + Cython glue function to call sparse_dot_nnz_only C++ implementation + This function will return nnz, the total number of nonzero + matrix-components of + C = [top n results > lower_bound for each row of A * B]. + + Input: + a_indptr, a_indices, a_data: CSR expression of A matrix + b_indptr, b_indices, b_data: CSR expression of B matrix + + ntop: n, the number of topmost results > lower_bound for + each row of C + lower_bound: a threshold that the element of A*B must + greater than + + Returned output: + nnz: the total number of nonzero matrix-components of C + + N.B. A and B must be CSR format!!! + The type of input numpy array must be aligned with types of C++ function arguments! 
+ """ + + cdef int* Ap = &a_indptr[0] + cdef int* Aj = &a_indices[0] + cdef double* Ax = &a_data[0] + cdef int* Bp = &b_indptr[0] + cdef int* Bj = &b_indices[0] + cdef double* Bx = &b_data[0] + + return sparse_dot_only_nnz_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound + ) diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 1b06e927..8d8fadc6 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -170,7 +170,6 @@ void inner_sparse_dot_topn( *(row_nnz_ptr++) = len; (*total) += len; } - real_candidates->shrink_to_fit(); } void sparse_dot_topn_parallel( @@ -335,7 +334,6 @@ void inner_sparse_dot_topn_extd( *(row_nnz_ptr++) = len; (*total) += len; } - real_candidates->shrink_to_fit(); } void sparse_dot_topn_extd_parallel( @@ -412,147 +410,98 @@ void sparse_dot_topn_extd_parallel( thread_list[job_nr].join(); } -void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - std::vector* vCj, - std::vector* vCx, - int* n_minmax, - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregates: - std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); - - // execute the jobs: - std::vector thread_list(n_jobs); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread ( - inner_sparse_dot_topn_extd, - job_ranges[job_nr], - n_col, - ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr], - &split_n_minmax[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results (in parallel): - *n_minmax = 
*std::max_element(split_n_minmax.begin(), split_n_minmax.end()); - - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - std::partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); - - int total = start_points.back(); - vCj->resize(total); - vCj->shrink_to_fit(); - vCx->resize(total); - vCx->shrink_to_fit(); - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_function, - job_ranges[job_nr], - Cp, - start_points[job_nr], - &((*vCj)[0]), - &((*vCx)[0]), - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); -} - -void inner_sparse_only_max_nnz_col( +void inner_sparse_nnz_only( job_range_type job_range, int n_col_inner, + int ntop_inner, + double lower_bound_inner, int Ap_copy[], int Aj_copy[], + double Ax_copy[], int Bp_copy[], int Bj_copy[], - int *max_nnz_col // already initialized to 0 + double Bx_copy[], + int* nnz ) { - std::vector unmarked(n_col_inner, true); + + std::vector next(n_col_inner,-1); + std::vector sums(n_col_inner, 0); for(int i = job_range.begin; i < job_range.end; i++){ + int head = -2; int length = 0; + int candidates_sz = 0; int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; + int jj_end = Ap_copy[i + 1]; for(int jj = jj_start; jj < jj_end; jj++){ int j = Aj_copy[jj]; + double v = Ax_copy[jj]; //value of A in (i,j) int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; + int kk_end = Bp_copy[j + 1]; for(int kk = kk_start; kk < kk_end; kk++){ int k = Bj_copy[kk]; //kth column of B in row j - if(unmarked[k]){ // if this k is not already marked then ... 
- unmarked[k] = false; // keep a record of column k + sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i + + if(next[k] == -1){ + next[k] = head; //keep a linked list, every element points to the next column index + head = k; length++; } } } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + + for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) + + if(sums[head] > lower_bound_inner) candidates_sz++; + + int temp = head; + head = next[head]; //iterate over columns + + next[temp] = -1; //clear arrays + sums[temp] = 0; //clear arrays + } + + if (candidates_sz > ntop_inner) candidates_sz = ntop_inner; + + (*nnz) += candidates_sz; } } -void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col, - int n_jobs +int sparse_dot_only_nnz_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int n_jobs ) { - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); + std::vector job_row_ranges(n_jobs); + distribute_load(n_row, n_jobs, job_row_ranges); - std::vector split_max_nnz_col(n_jobs, 0); + std::vector split_nnz(n_jobs, 0); std::vector thread_list(n_jobs); + for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread ( - inner_sparse_only_max_nnz_col, - job_ranges[job_nr], + inner_sparse_nnz_only, + job_row_ranges[job_nr], n_col, - Ap, Aj, Bp, Bj, - &split_max_nnz_col[job_nr] + ntop, lower_bound, + Ap, Aj, Ax, Bp, Bj, Bx, + &split_nnz[job_nr] ); } @@ -560,5 +509,6 @@ void sparse_dot_only_max_nnz_col_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); - *max_nnz_col = *max_element(split_max_nnz_col.begin(), split_max_nnz_col.end()); + return std::accumulate(split_nnz.begin(), split_nnz.end(), (int) 0); } + diff 
--git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 716ca04e..0099917e 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -58,33 +58,18 @@ extern void sparse_dot_topn_extd_parallel( int n_jobs ); -extern void sparse_dot_free_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax, - int njobs -); - -extern void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col, - int n_jobs +extern int sparse_dot_only_nnz_parallel( + int n_row, + int n_col, + int Ap[], + int Aj[], + double Ax[], + int Bp[], + int Bj[], + double Bx[], + int ntop, + double lower_bound, + int n_jobs ); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 17b8b121..0cc14e62 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -247,12 +247,14 @@ void sparse_dot_topn_extd_source( } /* - C++ implementation of sparse_dot_free_source + C++ implementation of sparse_dot_nnz_source - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. + This function will return the number nnz of nonzero elements + of the matrix C in CSR format, where + C = [all results > lower_bound sorted for each row of A * B] + and ntop the maximum number of elements per row of C. + This function is designed primarily to help with memory management for + very large sparse matrices. 
Input: n_row: number of rows of A matrix @@ -261,18 +263,15 @@ void sparse_dot_topn_extd_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix - ntop: n top results lower_bound: a threshold that the element of A*B must greater than - Output by reference: - Cp: C array for idx_pointer of CSR expression of C matrix - Cj: STL vector for indices of CSR expression of C matrix - Cx: STL vector for data values of CSR expression of C matrix - n_minmax: the maximum number of elements per row of C + Output: + nnz: number of nonzero elements of matrix C + ntop: maximum number of elements per row of C N.B. A and B must be CSR format!!! */ -void sparse_dot_free_source( +void sparse_dot_nnz_source( int n_row, int n_col, int Ap[], @@ -281,25 +280,16 @@ void sparse_dot_free_source( int Bp[], int Bj[], double Bx[], //data of B - int ntop, double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax + int* nnz, + int* ntop ) { - *n_minmax = 0; - int sz = std::max(n_row, n_col); - Cj->reserve(sz); - Cx->reserve(sz); - std::vector next(n_col,-1); std::vector sums(n_col, 0); - std::vector candidates; - - Cp[0] = 0; + *nnz = 0; + *ntop = 0; for(int i = 0; i < n_row; i++){ int head = -2; @@ -326,14 +316,10 @@ void sparse_dot_free_source( } } + int nnz_k = 0; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } + if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in int temp = head; head = next[head]; //iterate over columns @@ -341,36 +327,17 @@ void sparse_dot_free_source( next[temp] = -1; //clear arrays sums[temp] = 0; //clear arrays } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj->push_back(candidates[a].index); - Cx->push_back(candidates[a].value); - } - candidates.clear(); - - Cp[i+1] = Cj->size(); + *ntop = (nnz_k > *ntop)? nnz_k : *ntop; + *nnz += nnz_k; } } /* - C++ implementation of sparse_dot_nnz_source + C++ implementation of sparse_dot_only_max_nnz_col_source - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. + This function will return nnz, the total number of nonzero + matrix-components of + C = [top n results > lower_bound for each row of A * B]. Input: n_row: number of rows of A matrix @@ -379,15 +346,15 @@ void sparse_dot_free_source( Ap, Aj, Ax: CSR expression of A matrix Bp, Bj, Bx: CSR expression of B matrix + ntop: top n results lower_bound: a threshold that the element of A*B must greater than - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C + Returned output: + nnz: the total number of nonzero matrix-components of C N.B. A and B must be CSR format!!! 
*/ -void sparse_dot_nnz_source( +int sparse_dot_only_nnz_source( int n_row, int n_col, int Ap[], @@ -396,20 +363,19 @@ void sparse_dot_nnz_source( int Bp[], int Bj[], double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop + int ntop, + double lower_bound ) { std::vector next(n_col,-1); std::vector sums(n_col, 0); - *nnz = 0; - *ntop = 0; + int nnz = 0; for(int i = 0; i < n_row; i++){ int head = -2; int length = 0; + int candidates_sz = 0; int jj_start = Ap[i]; int jj_end = Ap[i+1]; @@ -432,10 +398,9 @@ void sparse_dot_nnz_source( } } - int nnz_k = 0; for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in + if(sums[head] > lower_bound) candidates_sz++; int temp = head; head = next[head]; //iterate over columns @@ -443,63 +408,10 @@ void sparse_dot_nnz_source( next[temp] = -1; //clear arrays sums[temp] = 0; //clear arrays } - *ntop = (nnz_k > *ntop)? nnz_k : *ntop; - *nnz += nnz_k; - } -} -/* - C++ implementation of sparse_dot_only_max_nnz_col_source + if (candidates_sz > ntop) candidates_sz = ntop; - This function will return the maximum number of columns set - per row over all rows of A * B - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - Output by reference: - max_nnz_col: the maximum number of columns set per row - over all rows of A * B - - N.B. A and B must be CSR format!!! 
-*/ -void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col -) -{ - std::vector unmarked(n_col, true); - - *max_nnz_col = 0; - - for(int i = 0; i < n_row; i++){ - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; // kth column of B in row j - - if(unmarked[k]){ // if this k is not already marked then ... - unmarked[k] = false; // keep a record of column k - length++; - } - } - } - *max_nnz_col = (length > *max_nnz_col)? length : *max_nnz_col; + nnz += candidates_sz; } + return nnz; } diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 9580d1cf..7975a75b 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ b/sparse_dot_topn/sparse_dot_topn_source.h @@ -61,7 +61,7 @@ extern void sparse_dot_topn_extd_source( int* n_minmax ); -extern void sparse_dot_free_source( +extern int sparse_dot_only_nnz_source( int n_row, int n_col, int Ap[], @@ -71,21 +71,7 @@ extern void sparse_dot_free_source( int Bj[], double Bx[], //data of B int ntop, - double lower_bound, - int Cp[], - std::vector* Cj, - std::vector* Cx, - int* n_minmax -); - -extern void sparse_dot_only_max_nnz_col_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int *max_nnz_col + double lower_bound ); #endif //UTILS_CPPCLASS_H diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index 84999abc..ad95fbb9 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -20,7 +20,6 @@ # distutils: language = c++ from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -66,7 +65,7 @@ 
cdef extern from "sparse_dot_topn_parallel.h": int n_jobs ); - cdef void sparse_dot_free_parallel( + cdef int sparse_dot_only_nnz_parallel( int n_row, int n_col, int Ap[], @@ -77,21 +76,6 @@ cdef extern from "sparse_dot_topn_parallel.h": double Bx[], int ntop, double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax, - int n_jobs - ); - - cdef void sparse_dot_only_max_nnz_col_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - int Bp[], - int Bj[], - int* max_nnz_col, int n_jobs ); @@ -161,7 +145,7 @@ cpdef sparse_dot_topn_extd_threaded( ) return -cpdef sparse_dot_free_threaded( +cpdef sparse_dot_only_nnz_threaded( int n_row, int n_col, np.ndarray[int, ndim=1] a_indptr, @@ -172,7 +156,6 @@ cpdef sparse_dot_free_threaded( np.ndarray[double, ndim=1] b_data, int ntop, double lower_bound, - np.ndarray[int, ndim=1] c_indptr, int n_jobs ): @@ -182,38 +165,7 @@ cpdef sparse_dot_free_threaded( cdef int* Bp = &b_indptr[0] cdef int* Bj = &b_indices[0] cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef np.ndarray[int, ndim=1] nminmax = np.array([0], dtype=np.int32) - cdef int* n_minmax = &nminmax[0] - cdef vector[int] vCj; - cdef vector[double] vCx; - - sparse_dot_free_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs + return sparse_dot_only_nnz_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, n_jobs ) - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data, nminmax[0] - -cpdef sparse_dot_only_max_nnz_col_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[int, ndim=1] max_nnz_col, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = 
&b_indices[0] - cdef int* o_max_nnz_col = &max_nnz_col[0] - - sparse_dot_only_max_nnz_col_parallel(n_row, n_col, Ap, Aj, Bp, Bj, o_max_nnz_col, n_jobs) - return diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index ba7dfbfc..1a631179 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -11,7 +11,7 @@ PRUNE_THRESHOLD = 0.1 NUM_CANDIDATES = 3 -MEM_MANAGER_IS_C = True +SCOUT_NNZ = True USE_THREADS = True MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 @@ -38,7 +38,7 @@ def get_n_top_sparse(mat, n_top=10): def helper_awesome_cossim_topn_dense( a_dense, b_dense, - mem_manager_is_C=False, + scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -58,7 +58,7 @@ def helper_awesome_cossim_topn_dense( awesome_result = awesome_cossim_topn( a_csr, b_csr_t, len(b_dense), 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -67,7 +67,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -79,7 +79,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, len(b_dense), PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -88,7 +88,7 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -119,7 +119,7 @@ def helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, flag=True, - mem_manager_is_C=False, + scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -141,7 +141,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -150,7 +150,7 @@ def 
helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, 0.0, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -162,7 +162,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -171,7 +171,7 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - mem_manager_is_C=mem_manager_is_C, + scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -216,14 +216,14 @@ def test_awesome_cossim_topn_manually(): [0.6, 0.1, 0.2, 0.8, 0.1], [0.9, 0.1, 0.6, 0.4, 0.3]] helper_awesome_cossim_topn_dense(a_dense, b_dense) - helper_awesome_cossim_topn_dense(a_dense, b_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_dense(a_dense, b_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_dense( a_dense, b_dense, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -240,14 +240,14 @@ def test_awesome_cossim_topn_manually(): [0, 0, 0, 0.1, 0.3], [0, 0, 0, 0.7, 0.5]] helper_awesome_cossim_topn_dense(c_dense, d_dense) - helper_awesome_cossim_topn_dense(c_dense, d_dense, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_dense(c_dense, d_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_dense( c_dense, d_dense, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -264,14 +264,14 @@ def test_awesome_cossim_top_one_zeros(): a_sparse = csr_matrix(np.zeros((1, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, 
format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -288,14 +288,14 @@ def test_awesome_cossim_top_all_zeros(): a_sparse = csr_matrix(np.zeros((2, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -311,7 +311,7 @@ def test_awesome_cossim_top_small_matrix(): a_sparse = rand(300, nr_vocab, density=density, format='csr') b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) @@ -319,7 +319,7 @@ def test_awesome_cossim_top_small_matrix(): a_sparse, b_sparse, False, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, 
use_threads=USE_THREADS, n_jobs=n_jobs ) @@ -360,7 +360,7 @@ def test_awesome_cossim_top_large_matrix(): b_sparse = b_sparse.tocsr() helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, mem_manager_is_C=MEM_MANAGER_IS_C) + helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) @@ -368,7 +368,7 @@ def test_awesome_cossim_top_large_matrix(): a_sparse, b_sparse, False, - mem_manager_is_C=MEM_MANAGER_IS_C, + scout_nnz=SCOUT_NNZ, use_threads=USE_THREADS, n_jobs=n_jobs ) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 4ea5e380..61903d5f 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -437,7 +437,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() optional_kwargs = { - 'return_best_topn': True, + 'return_best_ntop': True, 'use_threads': self._config.number_of_processes > 1, 'n_jobs': self._config.number_of_processes } From 6b7ee4b84d912bcdcf32afe5c47d37a8f3353419 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sun, 2 May 2021 23:16:12 +0200 Subject: [PATCH 15/29] introduced heuristic to reduce over-estimate of memory allocation for the matrix product --- setup.py | 10 ++- sparse_dot_topn/array_wrappers.pxd | 16 ++++ sparse_dot_topn/array_wrappers.pyx | 73 +++++++++++++++ sparse_dot_topn/awesome_cossim_topn.py | 71 ++++++--------- sparse_dot_topn/example/comparison2.py | 88 +++++++------------ sparse_dot_topn/example/comparison3.py | 61 +++++++++++++ sparse_dot_topn/sparse_dot_topn.pyx | 56 +++++++----- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 28 +++++- sparse_dot_topn/sparse_dot_topn_parallel.h | 5 
+- sparse_dot_topn/sparse_dot_topn_source.cpp | 40 +++++++-- sparse_dot_topn/sparse_dot_topn_source.h | 5 +- sparse_dot_topn/sparse_dot_topn_threaded.pyx | 29 ++++-- .../test/test_awesome_cossim_topn.py | 61 ------------- 13 files changed, 342 insertions(+), 201 deletions(-) create mode 100644 sparse_dot_topn/array_wrappers.pxd create mode 100644 sparse_dot_topn/array_wrappers.pyx create mode 100644 sparse_dot_topn/example/comparison3.py diff --git a/setup.py b/setup.py index c47aa78b..cf5d5fee 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,14 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] +array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', + sources=[ + './sparse_dot_topn/array_wrappers.pyx', + './sparse_dot_topn/sparse_dot_topn_source.cpp' + ], + extra_compile_args=extra_compile_args, + language='c++') + original_ext = Extension('sparse_dot_topn.sparse_dot_topn', sources=[ './sparse_dot_topn/sparse_dot_topn.pyx', @@ -82,5 +90,5 @@ def finalize_options(self): , 'pandas>=0.25.3' ], cmdclass={'build_ext': my_build_ext}, - ext_modules=[original_ext, threaded_ext] + ext_modules=[array_wrappers_ext, original_ext, threaded_ext], ) diff --git a/sparse_dot_topn/array_wrappers.pxd b/sparse_dot_topn/array_wrappers.pxd new file mode 100644 index 00000000..3af1a3c4 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pxd @@ -0,0 +1,16 @@ +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + cdef int view_count + cdef vector[int] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + cdef int view_count + cdef vector[double] vec + cdef Py_ssize_t shape[2] + cdef Py_ssize_t strides[2] diff --git a/sparse_dot_topn/array_wrappers.pyx 
b/sparse_dot_topn/array_wrappers.pyx new file mode 100644 index 00000000..18525766 --- /dev/null +++ b/sparse_dot_topn/array_wrappers.pyx @@ -0,0 +1,73 @@ +from cpython cimport Py_buffer +from libcpp.vector cimport vector + +# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_int: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. + + def __cinit__(self, vector[int]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'i' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 + + +# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: +cdef class ArrayWrapper_double: + # constructor and destructor are fairly unimportant now since + # vec will be destroyed automatically. 
+ + def __cinit__(self, vector[double]& data): + self.vec.swap(data) + self.view_count = 0 + + # now implement the buffer protocol for the class + # which makes it generally useful to anything that expects an array + def __getbuffer__(self, Py_buffer *buffer, int flags): + # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class + cdef Py_ssize_t itemsize = sizeof(self.vec[0]) + + self.shape[1] = self.vec.size() + self.shape[0] = 1 + self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) + self.strides[0] = self.vec.size() * self.strides[1] + buffer.buf = &(self.vec[0]) + buffer.format = 'd' + buffer.internal = NULL + buffer.itemsize = itemsize + buffer.len = self.vec.size() * itemsize # product(shape) * itemsize + buffer.ndim = 2 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + self.view_count += 1 + + def __releasebuffer__(self, Py_buffer *buffer): + self.view_count -= 1 diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py index baa14fbc..380c6e6e 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/sparse_dot_topn/awesome_cossim_topn.py @@ -2,6 +2,8 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr +from _ast import Continue +from numpy import indices if sys.version_info[0] >= 3: from sparse_dot_topn import sparse_dot_topn as ct @@ -12,7 +14,7 @@ def awesome_cossim_topn( - A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, scout_nnz=False, return_best_ntop=False): + A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, return_best_ntop=False): """ This function will return a matrix C in CSR format, where C = [sorted top n results > lower_bound for each row of A * B]. 
@@ -26,9 +28,6 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - scout_nnz: (default: False) this is mainly for testing purposes. if - True, will force a memory-size determination before computing - the results. return_best_ntop: (default: False) if True, will return best_ntop together with C as a tuple: (C, best_ntop) @@ -40,6 +39,14 @@ def awesome_cossim_topn( N.B. if A and B are not in CSR format, they will be converted to CSR """ + def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: + try: + ind_arr = np.empty(sz, dtype=idx_dtype) + dat_arr = np.empty(sz, dtype=data_dtype) + return True + except MemoryError: + return False + if not isspmatrix_csr(A): A = A.tocsr() if not isspmatrix_csr(B): @@ -67,50 +74,24 @@ def awesome_cossim_topn( else: return output - # filled matrices from here on - indptr = np.empty(M+1, dtype=idx_dtype) - try: - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - if scout_nnz: raise MemoryError # This is mainly for testing purposes - except MemoryError: - # if scout_nnz: print('Exception raised! Continuing ...', flush=True) - # It is likely you are here because nnz_max is too large. But don't give up just yet! - # sparse_dot_topn will go ahead and count the exact amount of memory required. 
- if not use_threads: - - nnz = ct.sparse_dot_only_nnz(M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, lower_bound - ) - - else: + indptr = np.empty(M + 1, dtype=idx_dtype) + + # reduce nnz_max if too large to fit in available memory: + while (not try_malloc(nnz_max, idx_dtype, A.dtype)): + nnz_max = nnz_max//2 - nnz = ct_thread.sparse_dot_only_nnz_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, lower_bound, n_jobs - ) - - nnz = max(1, nnz) - indices = np.empty(nnz, dtype=idx_dtype) - data = np.empty(nnz, dtype=A.dtype) - - # no exception was raised; then use old function (as it is expected to be the fastest) + # take a chance on high matrix-sparsity and reduce further: + nnz_max = max(M, nnz_max//16) + + # filled matrices from here on + indices = np.empty(nnz_max, dtype=idx_dtype) + data = np.empty(nnz_max, dtype=A.dtype) best_ntop_arr = np.full(1, 0, dtype=idx_dtype) if not use_threads: - ct.sparse_dot_topn_extd( + alt_indices, alt_data = ct.sparse_dot_topn_extd( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, @@ -127,7 +108,7 @@ def awesome_cossim_topn( err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
raise ValueError(err_str) - ct_thread.sparse_dot_topn_extd_threaded( + alt_indices, alt_data = ct_thread.sparse_dot_topn_extd_threaded( M, N, np.asarray(A.indptr, dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), A.data, @@ -139,6 +120,10 @@ def awesome_cossim_topn( indptr, indices, data, best_ntop_arr, n_jobs ) + if alt_indices is not None: + indices = alt_indices + data = alt_data + # prepare and return the output: output = csr_matrix((data, indices, indptr), shape=(M, N)) if return_best_ntop: diff --git a/sparse_dot_topn/example/comparison2.py b/sparse_dot_topn/example/comparison2.py index c54a2ff8..c79cb45f 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/sparse_dot_topn/example/comparison2.py @@ -9,9 +9,9 @@ from scipy.sparse import coo_matrix from sparse_dot_topn import awesome_cossim_topn # noqa: F401 -df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) +df = pd.DataFrame(columns=['sample', '#threads', 'python']) -N = 1000 +N = 4000 thresh = 0.01 nr_vocab = int(26**3) @@ -32,7 +32,6 @@ print('', flush=True) rng1 = np.random.RandomState(42) -rng2 = np.random.RandomState(43) n_matrix_pairs = 2**4 nnz_arr = np.full(n_matrix_pairs, 0) @@ -41,14 +40,14 @@ for it in range(n_matrix_pairs): row = rng1.randint(n_samples, size=nnz_a) - cols = rng2.randint(nr_vocab, size=nnz_a) + cols = rng1.randint(nr_vocab, size=nnz_a) data = rng1.rand(nnz_a) a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) a = a_sparse.tocsr() row = rng1.randint(n_duplicates, size=nnz_b) - cols = rng2.randint(nr_vocab, size=nnz_b) + cols = rng1.randint(nr_vocab, size=nnz_b) data = rng1.rand(nnz_b) b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) @@ -60,7 +59,8 @@ print('', flush=True) nnz_arr[it] = len(C.data) ntop_arr[it] = C_ntop - + del C + del C_ntop # top 5 results per row @@ -69,112 +69,88 @@ rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', number=3, globals=globals()) - rtv2 = 
timeit.timeit('awesome_cossim_topn(a, b, N, thresh, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 0, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 0, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 1 thread") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 1, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 1, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 2 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 2, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 2, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 3 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 3, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 3, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', 
flush=True) print("Threaded function with 4 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 4, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 4, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 5 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 5, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 5, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 6 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 6, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 6, rtv] r += 1 - print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print("Threaded function with 7 threads") rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', number=3, globals=globals()) - rtv2 = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7, scout_nnz=True)', - number=3, - globals=globals()) - df.loc[r] = [it, 7, rtv, rtv2, 100.*(rtv2 - rtv)/rtv] + df.loc[r] = [it, 7, rtv] r += 1 - 
print('sample\t\tpython\t\t+scout', flush=True) - print(f'{it}\t\t{rtv:7.4f}\t\t{rtv2:7.4f}', flush=True) + print('sample\t\tpython', flush=True) + print(f'{it}\t\t{rtv:7.4f}', flush=True) print('') print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') print('') df = df.astype({ - 'sample': np.int64, '#threads': np.int64, 'python': np.float64, '+scout': np.float64, '%inc': np.float64}) - results = df.groupby('#threads', as_index=True, sort=True)[['python', '+scout', '%inc']].mean() + 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) + results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() print(results) print('') diff --git a/sparse_dot_topn/example/comparison3.py b/sparse_dot_topn/example/comparison3.py new file mode 100644 index 00000000..b1b9412c --- /dev/null +++ b/sparse_dot_topn/example/comparison3.py @@ -0,0 +1,61 @@ +""" +This file compare our boosting method with calling scipy+numpy function directly +""" + +from __future__ import print_function +import timeit +import time +import numpy as np +import pandas as pd +from scipy.sparse import load_npz +from sparse_dot_topn import awesome_cossim_topn # noqa: F401 + +df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) + +a = load_npz('sparse_matrix_A.npz') +b = load_npz('sparse_matrix_B.npz') + +# tic = time.perf_counter() +# p = np.random.permutation(a.shape[0]) +# a = a[p] +# toc = time.perf_counter() +# print(f'shuffle(A) took {(toc - tic):0.4f} seconds', flush=True) + + +N = b.shape[1] +thresh = 0.8 + +nr_vocab = b.shape[0] +density_A = len(a.data)/(a.shape[0]*a.shape[1]) +density_B = len(b.data)/(b.shape[0]*b.shape[1]) +n_samples = a.shape[0] +n_duplicates = b.shape[1] +nnz_a = len(a.data) +nnz_b = len(b.data) + +print(f'ntop = {N}', flush=True) +print(f'threshold = {thresh}', flush=True) +print(f'density(A) = {density_A}', flush=True) 
+print(f'density(B) = {density_B}', flush=True) +print(f'nr_vocab = {nr_vocab}', flush=True) +print(f'n_samples = {n_samples}', flush=True) +print(f'n_duplicates = {n_duplicates}', flush=True) +print(f'nnz_A = {nnz_a}', flush=True) +print(f'nnz_B = {nnz_b}', flush=True) +print('', flush=True) + +n_matrix_pairs = 1 +nnz_arr = np.full(n_matrix_pairs, 0) +ntop_arr = np.full(n_matrix_pairs, 0) +r = 0 +it = 0 + +tic = time.perf_counter() +C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs = 7, return_best_ntop=True) +toc = time.perf_counter() + +print('scout_nnz=True, use_threads=True, n_jobs = 7') +print(f'nnz(A*B) = {len(C.data)}', flush=True) +print(f'ntop(A*B) = {C_ntop}', flush=True) +print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) + diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx index 9728c467..974b4ce9 100644 --- a/sparse_dot_topn/sparse_dot_topn.pyx +++ b/sparse_dot_topn/sparse_dot_topn.pyx @@ -20,6 +20,7 @@ # distutils: language = c++ from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -45,7 +46,7 @@ cdef extern from "sparse_dot_topn_source.h": double Cx[] ); - cdef void sparse_dot_topn_extd_source( + cdef int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -59,26 +60,12 @@ cdef extern from "sparse_dot_topn_source.h": int Cp[], int Cj[], double Cx[], + vector[int]* alt_Cj, + vector[double]* alt_Cx, + int nnz_max, int* nminmax ); - cdef void sparse_dot_free_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - vector[int]* Cj, - vector[double]* Cx, - int* n_minmax - ); - cdef int sparse_dot_only_nnz_source( int n_row, int n_col, @@ -158,7 +145,7 @@ cpdef sparse_dot_topn_extd( np.ndarray[int, ndim=1] c_indptr, np.ndarray[int, ndim=1] c_indices, np.ndarray[double, ndim=1] c_data, - 
np.ndarray[int, ndim=1] nminmax, + np.ndarray[int, ndim=1] nminmax ): """ Cython glue function to call sparse_dot_topn_extd C++ @@ -185,6 +172,13 @@ cpdef sparse_dot_topn_extd( nminmax: The maximum number of elements per row of C (assuming ntop = n_col) + Returned output: + c_indices, c_data: CSR expression of matrix C. These will + be returned instead of output by reference + if the preset sizes of c_indices and + c_data are too small to hold all the + results. + N.B. A and B must be CSR format!!! The type of input numpy array must be aligned with types of C++ function arguments! @@ -200,12 +194,26 @@ cpdef sparse_dot_topn_extd( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax + + cdef nnz_max = len(c_indices) + + cdef vector[int] vCj; + cdef vector[double] vCx; + + cdef int nnz_max_is_too_small = sparse_dot_topn_extd_source( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax ) - return - + + if nnz_max_is_too_small: + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data + + else: + + return None, None cpdef sparse_dot_only_nnz( int n_row, diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 8d8fadc6..2317e1ba 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -336,7 +336,7 @@ void inner_sparse_dot_topn_extd( } } -void sparse_dot_topn_extd_parallel( +int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -350,6 +350,9 @@ void sparse_dot_topn_extd_parallel( int Cp[], int Cj[], double Cx[], + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int *n_minmax, int n_jobs ) @@ -391,6 +394,23 @@ void 
sparse_dot_topn_extd_parallel( start_points[0] = 0; partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + int* Cj_container; + double* Cx_container; + + int total = start_points.back(); + int nnz_max_is_too_small = (nnz_max < total); + + if (nnz_max_is_too_small) { + alt_Cj->resize(total); + alt_Cx->resize(total); + Cj_container = &((*alt_Cj)[0]); + Cx_container = &((*alt_Cx)[0]); + } + else { + Cj_container = Cj; + Cx_container = Cx; + } + Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { @@ -399,8 +419,8 @@ void sparse_dot_topn_extd_parallel( job_ranges[job_nr], Cp, start_points[job_nr], - Cj, - Cx, + Cj_container, + Cx_container, &real_candidates[job_nr], &row_nnz[job_nr] ); @@ -408,6 +428,8 @@ void sparse_dot_topn_extd_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) thread_list[job_nr].join(); + + return nnz_max_is_too_small; } void inner_sparse_nnz_only( diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h index 0099917e..3aeb11e0 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.h +++ b/sparse_dot_topn/sparse_dot_topn_parallel.h @@ -40,7 +40,7 @@ extern void sparse_dot_topn_parallel( int n_jobs ); -extern void sparse_dot_topn_extd_parallel( +extern int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -54,6 +54,9 @@ extern void sparse_dot_topn_extd_parallel( int Cp[], int Cj[], double Cx[], + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax, int n_jobs ); diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index 0cc14e62..c908bbec 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -151,14 +151,22 @@ void sparse_dot_topn_source( ntop: n top results lower_bound: a threshold that the element of A*B must greater than + nnz_max: the size of the memory allocated for the results Cj and Cx. 
If + nnz_max is found to be too small during the computation, then the + results will be placed in vectors alt_Cj and alt_Cx instead Output by reference: Cp, Cj, Cx: CSR expression of C matrix n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) + alt_Cj, alt_Cx: CSR expression of C matrix as vectors. These will + contain the output only if nnz_max is found to be too small + Returned output: + nnz_max_is_too_small: int 1 or 0 depending on whether nnz_max was found to be + too small or not respectively N.B. A and B must be CSR format!!! */ -void sparse_dot_topn_extd_source( +int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -172,6 +180,9 @@ void sparse_dot_topn_extd_source( int Cp[], int Cj[], double Cx[], //data of C + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax ) { @@ -181,6 +192,7 @@ void sparse_dot_topn_extd_source( std::vector candidates; int nnz = 0; + int nnz_max_is_too_small = 0; Cp[0] = 0; *n_minmax = 0; @@ -234,16 +246,32 @@ void sparse_dot_topn_extd_source( } else { std::sort(candidates.begin(), candidates.end(), candidate_cmp); } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; + if (len + nnz > nnz_max){ + if (!nnz_max_is_too_small){ + nnz_max_is_too_small = true; + alt_Cj->resize(nnz); + alt_Cx->resize(nnz); + std::copy(Cj, Cj + nnz, alt_Cj->data()); + std::copy(Cx, Cx + nnz, alt_Cx->data()); + } + for(int a = 0; a < len; a++){ + alt_Cj->push_back(candidates[a].index); + alt_Cx->push_back(candidates[a].value); + nnz++; + } + } + else { + for(int a = 0; a < len; a++){ + Cj[nnz] = candidates[a].index; + Cx[nnz] = candidates[a].value; + nnz++; + } } candidates.clear(); Cp[i+1] = nnz; } + return nnz_max_is_too_small; } /* diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h index 7975a75b..0ac85127 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.h +++ 
b/sparse_dot_topn/sparse_dot_topn_source.h @@ -44,7 +44,7 @@ extern void sparse_dot_topn_source( double Cx[] //data of C ); -extern void sparse_dot_topn_extd_source( +extern int sparse_dot_topn_extd_source( int n_row, int n_col, int Ap[], @@ -58,6 +58,9 @@ extern void sparse_dot_topn_extd_source( int Cp[], int Cj[], double Cx[], //data of C + std::vector* alt_Cj, + std::vector* alt_Cx, + int nnz_max, int* n_minmax ); diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx index ad95fbb9..e20aaaaf 100644 --- a/sparse_dot_topn/sparse_dot_topn_threaded.pyx +++ b/sparse_dot_topn/sparse_dot_topn_threaded.pyx @@ -20,6 +20,7 @@ # distutils: language = c++ from libcpp.vector cimport vector +from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double cimport numpy as np import numpy as np @@ -47,7 +48,7 @@ cdef extern from "sparse_dot_topn_parallel.h": int n_jobs ); - cdef void sparse_dot_topn_extd_parallel( + cdef int sparse_dot_topn_extd_parallel( int n_row, int n_col, int Ap[], @@ -61,6 +62,9 @@ cdef extern from "sparse_dot_topn_parallel.h": int Cp[], int Cj[], double Cx[], + vector[int]* alt_Cj, + vector[double]* alt_Cx, + int nnz_max, int* n_minmax, int n_jobs ); @@ -139,11 +143,26 @@ cpdef sparse_dot_topn_extd_threaded( cdef int* Cj = &c_indices[0] cdef double* Cx = &c_data[0] cdef int* n_minmax = &nminmax[0] - - sparse_dot_topn_extd_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_minmax, n_jobs + + cdef nnz_max = len(c_indices) + + cdef vector[int] vCj; + cdef vector[double] vCx; + + cdef int nnz_max_is_too_small = sparse_dot_topn_extd_parallel( + n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax, n_jobs ) - return + + if nnz_max_is_too_small: + + c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) + c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) + + return c_indices, c_data + + else: + + return None, None 
cpdef sparse_dot_only_nnz_threaded( int n_row, diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index 1a631179..5560ccc3 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -11,7 +11,6 @@ PRUNE_THRESHOLD = 0.1 NUM_CANDIDATES = 3 -SCOUT_NNZ = True USE_THREADS = True MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 @@ -38,7 +37,6 @@ def get_n_top_sparse(mat, n_top=10): def helper_awesome_cossim_topn_dense( a_dense, b_dense, - scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -58,7 +56,6 @@ def helper_awesome_cossim_topn_dense( awesome_result = awesome_cossim_topn( a_csr, b_csr_t, len(b_dense), 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -67,7 +64,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -79,7 +75,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, len(b_dense), PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -88,7 +83,6 @@ def helper_awesome_cossim_topn_dense( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -119,7 +113,6 @@ def helper_awesome_cossim_topn_sparse( a_sparse, b_sparse, flag=True, - scout_nnz=False, use_threads=False, n_jobs=1 ): @@ -141,7 +134,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -150,7 +142,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, 0.0, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -162,7 +153,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, b_sparse.shape[0], PRUNE_THRESHOLD, - scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -171,7 +161,6 @@ def helper_awesome_cossim_topn_sparse( b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, 
- scout_nnz=scout_nnz, use_threads=use_threads, n_jobs=n_jobs ) @@ -216,17 +205,9 @@ def test_awesome_cossim_topn_manually(): [0.6, 0.1, 0.2, 0.8, 0.1], [0.9, 0.1, 0.6, 0.4, 0.3]] helper_awesome_cossim_topn_dense(a_dense, b_dense) - helper_awesome_cossim_topn_dense(a_dense, b_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_dense( - a_dense, - b_dense, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) # boundary checking, there is no matching at all in this case c_dense = [[0.2, 0.1, 0.3, 0, 0], @@ -240,17 +221,9 @@ def test_awesome_cossim_topn_manually(): [0, 0, 0, 0.1, 0.3], [0, 0, 0, 0.7, 0.5]] helper_awesome_cossim_topn_dense(c_dense, d_dense) - helper_awesome_cossim_topn_dense(c_dense, d_dense, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_dense( - c_dense, - d_dense, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -264,17 +237,9 @@ def test_awesome_cossim_top_one_zeros(): a_sparse = csr_matrix(np.zeros((1, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -288,17 +253,9 @@ def 
test_awesome_cossim_top_all_zeros(): a_sparse = csr_matrix(np.zeros((2, nr_vocab))) b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -311,18 +268,9 @@ def test_awesome_cossim_top_small_matrix(): a_sparse = rand(300, nr_vocab, density=density, format='csr') b_sparse = rand(800, nr_vocab, density=density, format='csr') helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - False, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) @pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") @@ -360,15 +308,6 @@ def test_awesome_cossim_top_large_matrix(): b_sparse = b_sparse.tocsr() helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, scout_nnz=SCOUT_NNZ) for process in range(MAX_N_PROCESSES): n_jobs = process + 1 helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - False, - scout_nnz=SCOUT_NNZ, - use_threads=USE_THREADS, - n_jobs=n_jobs - ) From 0b3bc8a5cd9990a947e5d75b0b4a550bc42d74f4 Mon Sep 17 00:00:00 2001 
From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 3 May 2021 11:25:42 +0200 Subject: [PATCH 16/29] tried vector reserve --- sparse_dot_topn/example/comparison3.py | 2 - sparse_dot_topn/sparse_dot_topn_parallel.cpp | 64 +++++++++++++++----- sparse_dot_topn/sparse_dot_topn_source.cpp | 1 + 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/sparse_dot_topn/example/comparison3.py b/sparse_dot_topn/example/comparison3.py index b1b9412c..c0bcf145 100644 --- a/sparse_dot_topn/example/comparison3.py +++ b/sparse_dot_topn/example/comparison3.py @@ -10,8 +10,6 @@ from scipy.sparse import load_npz from sparse_dot_topn import awesome_cossim_topn # noqa: F401 -df = pd.DataFrame(columns=['sample', '#threads', 'python', '+scout', '%inc']) - a = load_npz('sparse_matrix_A.npz') b = load_npz('sparse_matrix_B.npz') diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 2317e1ba..522f9e72 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -51,7 +51,37 @@ void distribute_load( } } -void inner_gather_function( +void inner_gather_v2( + job_range_type job_range, + int Cp[], + int Cp_start, + int Cj[], + double Cx[], + std::vector* real_candidates, + std::vector* row_nnz +) +{ + if (job_range.begin >= job_range.end) return; + + int* nnz_begin = row_nnz->data(); + int* nnz_end = nnz_begin + row_nnz->size(); + + int* Cp_begin = &Cp[job_range.begin + 1]; + + (*row_nnz)[0] += Cp_start; + std::partial_sum(nnz_begin, nnz_end, Cp_begin); + + candidate* c_begin = real_candidates->data(); + candidate* c_end = c_begin + real_candidates->size(); + + int* Cj_begin = &Cj[Cp_start]; + double* Cx_begin = &Cx[Cp_start]; + + std::transform(c_begin, c_end, Cj_begin, [](candidate c) -> int { return c.index; }); + std::transform(c_begin, c_end, Cx_begin, [](candidate c) -> double { return c.value; }); +} + +void inner_gather_v1( job_range_type 
job_range, int Cp[], int Cp_start, @@ -217,18 +247,18 @@ void sparse_dot_topn_parallel( thread_list[job_nr].join(); // gather the results: - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector nnz_job_starts(n_jobs + 1); + nnz_job_starts[0] = 0; + partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); Cp[0] = 0; for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread( - inner_gather_function, + inner_gather_v1, job_ranges[job_nr], Cp, - start_points[job_nr], + nnz_job_starts[job_nr], Cj, Cx, &real_candidates[job_nr], @@ -254,13 +284,14 @@ void inner_sparse_dot_topn_extd( std::vector* real_candidates, std::vector* row_nnz, int* total, - int* n_minmax + int* n_minmax, + int mem_sz_per_row ) { std::vector next(n_col_inner,-1); std::vector sums(n_col_inner, 0); - real_candidates->reserve(job_range.end - job_range.begin); + real_candidates->reserve(mem_sz_per_row*(job_range.end - job_range.begin)); row_nnz->resize(job_range.end - job_range.begin); int* row_nnz_ptr = row_nnz->data(); @@ -367,6 +398,8 @@ int sparse_dot_topn_extd_parallel( std::vector sub_total(n_jobs, 0); std::vector split_n_minmax(n_jobs, 0); + int mem_sz_per_row = std::max(1, (int) ceil(((double) nnz_max)/((double) n_row))); + std::vector thread_list(n_jobs); for (int job_nr = 0; job_nr < n_jobs; job_nr++) { @@ -380,7 +413,8 @@ int sparse_dot_topn_extd_parallel( &real_candidates[job_nr], &row_nnz[job_nr], &sub_total[job_nr], - &split_n_minmax[job_nr] + &split_n_minmax[job_nr], + mem_sz_per_row ); } @@ -390,14 +424,14 @@ int sparse_dot_topn_extd_parallel( // gather the results: *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - std::vector start_points(n_jobs + 1); - start_points[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), start_points.begin() + 1); + std::vector nnz_job_starts(n_jobs + 1); + nnz_job_starts[0] = 0; + 
partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); int* Cj_container; double* Cx_container; - int total = start_points.back(); + int total = nnz_job_starts.back(); int nnz_max_is_too_small = (nnz_max < total); if (nnz_max_is_too_small) { @@ -415,10 +449,10 @@ int sparse_dot_topn_extd_parallel( for (int job_nr = 0; job_nr < n_jobs; job_nr++) { thread_list[job_nr] = std::thread( - inner_gather_function, + inner_gather_v1, job_ranges[job_nr], Cp, - start_points[job_nr], + nnz_job_starts[job_nr], Cj_container, Cx_container, &real_candidates[job_nr], diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp index c908bbec..be987495 100644 --- a/sparse_dot_topn/sparse_dot_topn_source.cpp +++ b/sparse_dot_topn/sparse_dot_topn_source.cpp @@ -190,6 +190,7 @@ int sparse_dot_topn_extd_source( std::vector sums(n_col, 0); std::vector candidates; + candidates.reserve(n_col); int nnz = 0; int nnz_max_is_too_small = 0; From 80d388bb1f1f08b1624737524a44c2e406f2586d Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 4 May 2021 23:03:18 +0200 Subject: [PATCH 17/29] fixed bug related to single-valued input Series --- .github/workflows/test.yml | 28 ++++++++++++++++++++++++++++ string_grouper/string_grouper.py | 18 ++++++++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..17dcc3ee --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,28 @@ +name: Run tests +on: + pull_request: + push: + branches: + - master + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.7, 3.8, 3.9] + os: [ubuntu-latest, windows-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: 
${{ matrix.python-version }} + + - name: Install package + run: pip install . + + - name: Run tests + run: python -m unittest diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 61903d5f..2be98158 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -255,7 +255,7 @@ def fit(self) -> 'StringGrouper': matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix(matches) + matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -532,11 +532,10 @@ def _get_nearest_matches(self, dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] output.index = self._duplicates.index - return output.squeeze() + return output.squeeze(axis=1) def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: - # discard self-matches: A matches A - pairs = self._matches_list[self._matches_list['master_side'] != self._matches_list['dupe_side']] + pairs = self._matches_list # rebuild graph adjacency matrix from already found matches: n = len(self._master) graph = csr_matrix( @@ -564,7 +563,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: graph.data = pairs['similarity'].to_numpy() # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ... # ... 
convert to 1D numpy array (using asarray then squeeze) and then to Series: - group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze()) + group_of_master_index['weight'] = pd.Series(np.asarray(graph.sum(axis=1)).squeeze(axis=1)) method = 'idxmax' # Determine the group representatives AND merge with indices: @@ -588,7 +587,7 @@ def _deduplicate(self, ignore_index=False) -> Union[pd.DataFrame, pd.Series]: output_id = self._master_id.iloc[group_of_master_index.group_rep].rename(id_label).reset_index(drop=True) output = pd.concat([output_id, output], axis=1) output.index = self._master.index - return output.squeeze() + return output def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, pd.Series]: master_strings = self._master @@ -617,19 +616,22 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix: + def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: A = AA.tolil() r, c = A.nonzero() A[c, r] = A[r, c] + r = np.arange(A.shape[0]) + A[r, r] = 1 return A.tocsr() @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" r, c = matches.nonzero() - return pd.DataFrame({'master_side': r.astype(np.int64), + matches_list = pd.DataFrame({'master_side': r.astype(np.int64), 'dupe_side': c.astype(np.int64), 'similarity': matches.data}) + return matches_list @staticmethod def _make_symmetric(new_matches: pd.DataFrame) -> pd.DataFrame: From 2c6b102d3883f93c3e45272f99e525996425bc85 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 00:49:25 +0200 Subject: [PATCH 18/29] fixed bug related to single-valued input Series --- string_grouper/string_grouper.py | 29 ++++++++++++++++------ string_grouper/test/test_string_grouper.py | 29 ++++++++++++++++++---- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git 
a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 2be98158..d0b1844c 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -251,11 +251,21 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() + # Calculate the matches using the cosine similarity matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) - if self._duplicates is None and self._max_n_matches < self._true_max_n_matches: - # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A) - matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches) + + if self._duplicates is None: + # convert to lil format for best efficiency when setting matrix-elements + matches = matches.tolil() + # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by + # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results) + matches = StringGrouper._fix_diagonal(matches) + if self._max_n_matches < self._true_max_n_matches: + # the list of matches must be symmetric! 
(i.e., if A != B and A matches B; then B matches A) + matches = StringGrouper._symmetrize_matrix(matches) + matches = matches.tocsr() + # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -616,13 +626,16 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix: - A = AA.tolil() - r, c = A.nonzero() - A[c, r] = A[r, c] + def _fix_diagonal(A) -> csr_matrix: r = np.arange(A.shape[0]) A[r, r] = 1 - return A.tocsr() + return A + + @staticmethod + def _symmetrize_matrix(A) -> csr_matrix: + r, c = A.nonzero() + A[c, r] = A[r, c] + return A @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index d5c1dd0b..383f4b11 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -197,7 +197,10 @@ def test_match_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix) + @patch( + 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix', + side_effect=mock_symmetrize_matrix + ) def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" @@ -236,17 +239,33 @@ def test_match_list_symmetry_with_symmetrize_function(self): # upper, upper_prime and their intersection should be identical. 
self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) - def test_match_list_diagonal(self): + @patch( + 'string_grouper.string_grouper.StringGrouper._fix_diagonal', + side_effect=mock_symmetrize_matrix + ) + def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal): """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() df = simple_example.customers_df['Customer Name'] matches = match_strings(df, max_n_matches=1) + mock_fix_diagonal.assert_called_once() num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) num_strings = len(df) self.assertNotEqual(num_self_joins, num_strings) + def test_match_list_diagonal(self): + """This test ensures that all self-joins are present""" + # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; + # for small datasets setting max_n_matches=1 reproduces the bug + simple_example = SimpleExample() + df = simple_example.customers_df['Customer Name'] + matches = match_strings(df, max_n_matches=1) + num_self_joins = len(matches[matches['left_index'] == matches['right_index']]) + num_strings = len(df) + self.assertEqual(num_self_joins, num_strings) + def test_zero_min_similarity(self): """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. 
A bug related to this was first pointed out by @nbcvijanovic""" @@ -381,7 +400,7 @@ def test_get_matches_single(self): left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] left_index = [0, 0, 1, 2, 3, 3] - right_index = [3, 0, 1, 2, 3, 0] + right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, @@ -397,8 +416,8 @@ def test_get_matches_1_series_1_id_series(self): left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] left_index = [0, 0, 1, 2, 3, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'] - right_index = [3, 0, 1, 2, 3, 0] + right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] + right_index = [0, 3, 1, 2, 0, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, From 1b8ddecf48bb7eaecdacc3f89fd7b84eadef1321 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 01:06:23 +0200 Subject: [PATCH 19/29] modified GitHub workflow action script test.yml --- .github/workflows/test.yml | 8 ++++++-- setup.py | 3 +-- sparse_dot_topn/sparse_dot_topn_parallel.cpp | 3 ++- sparse_dot_topn/test/test_awesome_cossim_topn.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 17dcc3ee..93336b1e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,7 +22,11 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install package - run: pip install . + run: | + python -m pip install --upgrade pip + pip install -e . 
- name: Run tests - run: python -m unittest + run: | + pip install pytest + pytest -ra --capture=no --showlocals diff --git a/setup.py b/setup.py index cf5d5fee..8c51b072 100644 --- a/setup.py +++ b/setup.py @@ -31,8 +31,7 @@ def finalize_options(self): array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', sources=[ - './sparse_dot_topn/array_wrappers.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' + './sparse_dot_topn/array_wrappers.pyx' ], extra_compile_args=extra_compile_args, language='c++') diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp index 522f9e72..0efb7a45 100644 --- a/sparse_dot_topn/sparse_dot_topn_parallel.cpp +++ b/sparse_dot_topn/sparse_dot_topn_parallel.cpp @@ -20,6 +20,7 @@ // Modified by: Particular Miner // April 14, 2021 +#include #include #include #include @@ -398,7 +399,7 @@ int sparse_dot_topn_extd_parallel( std::vector sub_total(n_jobs, 0); std::vector split_n_minmax(n_jobs, 0); - int mem_sz_per_row = std::max(1, (int) ceil(((double) nnz_max)/((double) n_row))); + int mem_sz_per_row = std::max(1, (int) std::ceil(((double) nnz_max)/((double) n_row))); std::vector thread_list(n_jobs); diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py index 5560ccc3..a9734668 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/sparse_dot_topn/test/test_awesome_cossim_topn.py @@ -280,7 +280,7 @@ def test_awesome_cossim_top_large_matrix(): # test with large matrix nr_vocab = 2 << 24 density = 1e-6 - n_samples = 10000 + n_samples = 1000 nnz = int(n_samples * nr_vocab * density) rng1 = np.random.RandomState(42) From 75fdf3d2006cff5e8a9a8e327a1e918c17b091ce Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 11:01:40 +0200 Subject: [PATCH 20/29] renamed sparse_dot_topn sub-package to string_grouper_topn to avoid possible conflicts with original 
pypi package sparse_dot_topn --- .github/workflows/test.yml | 4 +- setup.py | 44 +++++++++---------- string_grouper/string_grouper.py | 2 +- .../__init__.py | 2 +- .../array_wrappers.pxd | 0 .../array_wrappers.pyx | 0 .../awesome_cossim_topn.py | 5 +-- .../example/comparison.py | 2 +- .../example/comparison2.py | 2 +- .../example/comparison3.py | 2 +- .../example/example.py | 2 +- .../sparse_dot_topn.pyx | 0 .../sparse_dot_topn_parallel.cpp | 0 .../sparse_dot_topn_parallel.h | 0 .../sparse_dot_topn_source.cpp | 0 .../sparse_dot_topn_source.h | 0 .../sparse_dot_topn_threaded.pyx | 0 .../test/test_awesome_cossim_topn.py | 2 +- string_grouper_utils/string_grouper_utils.py | 6 +-- 19 files changed, 36 insertions(+), 37 deletions(-) rename {sparse_dot_topn => string_grouper_topn}/__init__.py (60%) rename {sparse_dot_topn => string_grouper_topn}/array_wrappers.pxd (100%) rename {sparse_dot_topn => string_grouper_topn}/array_wrappers.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/awesome_cossim_topn.py (96%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison.py (98%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison2.py (98%) rename {sparse_dot_topn => string_grouper_topn}/example/comparison3.py (96%) rename {sparse_dot_topn => string_grouper_topn}/example/example.py (86%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_parallel.cpp (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_parallel.h (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_source.cpp (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_source.h (100%) rename {sparse_dot_topn => string_grouper_topn}/sparse_dot_topn_threaded.pyx (100%) rename {sparse_dot_topn => string_grouper_topn}/test/test_awesome_cossim_topn.py (99%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 93336b1e..5317a62d 
100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,10 +21,10 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install package + - name: Install dev-package run: | python -m pip install --upgrade pip - pip install -e . + pip install -v -e . - name: Run tests run: | diff --git a/setup.py b/setup.py index 8c51b072..cf333180 100644 --- a/setup.py +++ b/setup.py @@ -29,27 +29,27 @@ def finalize_options(self): else: extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] -array_wrappers_ext = Extension('sparse_dot_topn.array_wrappers', +array_wrappers_ext = Extension('string_grouper_topn.array_wrappers', sources=[ - './sparse_dot_topn/array_wrappers.pyx' + './string_grouper_topn/array_wrappers.pyx', ], extra_compile_args=extra_compile_args, language='c++') -original_ext = Extension('sparse_dot_topn.sparse_dot_topn', +original_ext = Extension('string_grouper_topn.sparse_dot_topn', sources=[ - './sparse_dot_topn/sparse_dot_topn.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp' + './string_grouper_topn/sparse_dot_topn.pyx', + './string_grouper_topn/sparse_dot_topn_source.cpp', ], extra_compile_args=extra_compile_args, define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], language='c++') -threaded_ext = Extension('sparse_dot_topn.sparse_dot_topn_threaded', +threaded_ext = Extension('string_grouper_topn.sparse_dot_topn_threaded', sources=[ - './sparse_dot_topn/sparse_dot_topn_threaded.pyx', - './sparse_dot_topn/sparse_dot_topn_source.cpp', - './sparse_dot_topn/sparse_dot_topn_parallel.cpp' + './string_grouper_topn/sparse_dot_topn_threaded.pyx', + './string_grouper_topn/sparse_dot_topn_source.cpp', + './string_grouper_topn/sparse_dot_topn_parallel.cpp', ], extra_compile_args=extra_compile_args, define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], @@ -59,9 +59,9 @@ def finalize_options(self): name='string_grouper', version='0.4.0', packages=[ - 'string_grouper' - , 'string_grouper_utils' - , 
'sparse_dot_topn' + 'string_grouper_topn', + 'string_grouper', + 'string_grouper_utils', ], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' @@ -75,18 +75,18 @@ def finalize_options(self): zip_safe=False, python_requires='>3.7', setup_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0' - , 'cython>=0.29.15' - , 'numpy' - , 'scipy' + 'setuptools>=18.0', + 'cython>=0.29.15', + 'numpy', + 'scipy', ], install_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0' - , 'cython>=0.29.15' - , 'numpy' - , 'scipy' - , 'scikit-learn' - , 'pandas>=0.25.3' + 'setuptools>=18.0', + 'cython>=0.29.15', + 'numpy', + 'scipy', + 'scikit-learn', + 'pandas>=0.25.3', ], cmdclass={'build_ext': my_build_ext}, ext_modules=[array_wrappers_ext, original_ext, threaded_ext], diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d0b1844c..a2991475 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -6,7 +6,7 @@ from scipy.sparse.csr import csr_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 diff --git a/sparse_dot_topn/__init__.py b/string_grouper_topn/__init__.py similarity index 60% rename from sparse_dot_topn/__init__.py rename to string_grouper_topn/__init__.py index cbaf32a7..b123439e 100644 --- a/sparse_dot_topn/__init__.py +++ b/string_grouper_topn/__init__.py @@ -2,6 +2,6 @@ import sys if sys.version_info[0] >= 3: - from sparse_dot_topn.awesome_cossim_topn import awesome_cossim_topn + from string_grouper_topn.awesome_cossim_topn import awesome_cossim_topn else: from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff 
--git a/sparse_dot_topn/array_wrappers.pxd b/string_grouper_topn/array_wrappers.pxd similarity index 100% rename from sparse_dot_topn/array_wrappers.pxd rename to string_grouper_topn/array_wrappers.pxd diff --git a/sparse_dot_topn/array_wrappers.pyx b/string_grouper_topn/array_wrappers.pyx similarity index 100% rename from sparse_dot_topn/array_wrappers.pyx rename to string_grouper_topn/array_wrappers.pyx diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py similarity index 96% rename from sparse_dot_topn/awesome_cossim_topn.py rename to string_grouper_topn/awesome_cossim_topn.py index 380c6e6e..4f90ae63 100644 --- a/sparse_dot_topn/awesome_cossim_topn.py +++ b/string_grouper_topn/awesome_cossim_topn.py @@ -2,12 +2,11 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from _ast import Continue from numpy import indices if sys.version_info[0] >= 3: - from sparse_dot_topn import sparse_dot_topn as ct - from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread + from string_grouper_topn import sparse_dot_topn as ct + from string_grouper_topn import sparse_dot_topn_threaded as ct_thread else: import sparse_dot_topn as ct import sparse_dot_topn_threaded as ct_thread diff --git a/sparse_dot_topn/example/comparison.py b/string_grouper_topn/example/comparison.py similarity index 98% rename from sparse_dot_topn/example/comparison.py rename to string_grouper_topn/example/comparison.py index 7ee673ca..ce3cc0ad 100644 --- a/sparse_dot_topn/example/comparison.py +++ b/string_grouper_topn/example/comparison.py @@ -6,7 +6,7 @@ import timeit import numpy as np from scipy.sparse import coo_matrix -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 N = 1000 thresh = 0.01 diff --git a/sparse_dot_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py similarity index 98% rename from 
sparse_dot_topn/example/comparison2.py rename to string_grouper_topn/example/comparison2.py index c79cb45f..5cc631f1 100644 --- a/sparse_dot_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import coo_matrix -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 df = pd.DataFrame(columns=['sample', '#threads', 'python']) diff --git a/sparse_dot_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py similarity index 96% rename from sparse_dot_topn/example/comparison3.py rename to string_grouper_topn/example/comparison3.py index c0bcf145..74983dde 100644 --- a/sparse_dot_topn/example/comparison3.py +++ b/string_grouper_topn/example/comparison3.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd from scipy.sparse import load_npz -from sparse_dot_topn import awesome_cossim_topn # noqa: F401 +from string_grouper_topn import awesome_cossim_topn # noqa: F401 a = load_npz('sparse_matrix_A.npz') b = load_npz('sparse_matrix_B.npz') diff --git a/sparse_dot_topn/example/example.py b/string_grouper_topn/example/example.py similarity index 86% rename from sparse_dot_topn/example/example.py rename to string_grouper_topn/example/example.py index a61951fd..a403d3ab 100644 --- a/sparse_dot_topn/example/example.py +++ b/string_grouper_topn/example/example.py @@ -1,5 +1,5 @@ from scipy.sparse import rand -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn N = 10 a = rand(100, 1000000, density=0.005, format='csr') diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/string_grouper_topn/sparse_dot_topn.pyx similarity index 100% rename from sparse_dot_topn/sparse_dot_topn.pyx rename to string_grouper_topn/sparse_dot_topn.pyx diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/string_grouper_topn/sparse_dot_topn_parallel.cpp similarity 
index 100% rename from sparse_dot_topn/sparse_dot_topn_parallel.cpp rename to string_grouper_topn/sparse_dot_topn_parallel.cpp diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/string_grouper_topn/sparse_dot_topn_parallel.h similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_parallel.h rename to string_grouper_topn/sparse_dot_topn_parallel.h diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/string_grouper_topn/sparse_dot_topn_source.cpp similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_source.cpp rename to string_grouper_topn/sparse_dot_topn_source.cpp diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/string_grouper_topn/sparse_dot_topn_source.h similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_source.h rename to string_grouper_topn/sparse_dot_topn_source.h diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/string_grouper_topn/sparse_dot_topn_threaded.pyx similarity index 100% rename from sparse_dot_topn/sparse_dot_topn_threaded.pyx rename to string_grouper_topn/sparse_dot_topn_threaded.pyx diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py similarity index 99% rename from sparse_dot_topn/test/test_awesome_cossim_topn.py rename to string_grouper_topn/test/test_awesome_cossim_topn.py index a9734668..80a71431 100644 --- a/sparse_dot_topn/test/test_awesome_cossim_topn.py +++ b/string_grouper_topn/test/test_awesome_cossim_topn.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from sparse_dot_topn import awesome_cossim_topn +from string_grouper_topn import awesome_cossim_topn from scipy.sparse.csr import csr_matrix from scipy.sparse import coo_matrix from scipy.sparse import rand diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index 11803a32..a570b377 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -1,7 +1,7 @@ 
-import numpy as np import pandas as pd from typing import List, Optional, Union from dateutil.parser import parse +from dateutil.tz import UTC from numbers import Number from datetime import datetime import re @@ -143,13 +143,13 @@ def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Ser # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): # convert strings to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(parse(x, parserinfo, **kwargs))) + return timestamps.transform(lambda x: parse(x, parserinfo, **kwargs).astimezone(UTC)) elif is_series_of_type(type(pd.Timestamp('15-1-2000')), timestamps): # convert pandas Timestamps to numpy datetime64 return timestamps.transform(lambda x: x.to_numpy()) elif is_series_of_type(datetime, timestamps): # convert python datetimes to numpy datetime64 - return timestamps.transform(lambda x: np.datetime64(x)) + return timestamps.transform(lambda x: x.astimezone(UTC)) elif is_series_of_type(Number, timestamps): return timestamps raise Exception(error_msg) From 29dcb4204a186e628f7269fe1329ff9149fb492b Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Wed, 5 May 2021 14:49:33 +0200 Subject: [PATCH 21/29] added unittest for get_groups() with single-valued input Series --- string_grouper/test/test_string_grouper.py | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 383f4b11..4344177a 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -488,6 +488,43 @@ def test_get_groups_single_df_group_rep_default(self): ) ) + def test_get_groups_single_valued_series(self): + """This test ensures that get_groups() returns a single-valued DataFrame or Series object + since the input-series is also single-valued. 
This test was created in response to a bug discovered + by George Walker""" + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['group_rep_index', 'group_rep']), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='group_rep'), + group_similar_strings( + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + pd.testing.assert_frame_equal( + pd.DataFrame([(0, "hello")], columns=['most_similar_index', 'most_similar_master']), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6 + ) + ) + pd.testing.assert_series_equal( + pd.Series(["hello"], name='most_similar_master'), + match_most_similar( + pd.Series(["hello"]), + pd.Series(["hello"]), + min_similarity=0.6, + ignore_index=True + ) + ) + def test_get_groups_single_df_keep_index(self): """Should return a pd.Series object with the same length as the original df. The series object will contain a list of the grouped strings with their indexes displayed in columns""" From 6f6ff50101a9383a73c07c66ba1b256f4d7edc5a Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Sat, 8 May 2021 10:11:13 +0200 Subject: [PATCH 22/29] fixed other squeeze() bugs --- string_grouper/string_grouper.py | 4 +-- string_grouper_topn/example/comparison2.py | 35 ++++++++++++++-------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index a2991475..c4cfbdef 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -277,7 +277,7 @@ def dot(self) -> pd.Series: raise Exception("To perform this function, both input Series must have the same length.") master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate pairwise cosine similarities: - pairwise_similarities = 
np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze() + pairwise_similarities = np.asarray(master_matrix.multiply(duplicate_matrix).sum(axis=1)).squeeze(axis=1) return pd.Series(pairwise_similarities, name='similarity', index=self._master.index) @validate_is_fit @@ -673,7 +673,7 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return False elif series_to_test.to_frame().applymap( lambda x: not isinstance(x, str) - ).squeeze().any(): + ).squeeze(axis=1).any(): return False return True diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py index 5cc631f1..ca4e1fff 100644 --- a/string_grouper_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -8,6 +8,8 @@ import pandas as pd from scipy.sparse import coo_matrix from string_grouper_topn import awesome_cossim_topn # noqa: F401 +from test.sortperf import flush +from _sqlite3 import Row df = pd.DataFrame(columns=['sample', '#threads', 'python']) @@ -16,7 +18,7 @@ nr_vocab = int(26**3) density = 30/nr_vocab -n_samples = 1000000 +n_samples = 10000000 n_duplicates = N nnz_a = int(n_samples * nr_vocab * density) nnz_b = int(n_duplicates * nr_vocab * density) @@ -38,22 +40,30 @@ ntop_arr = np.full(n_matrix_pairs, 0) r = 0 for it in range(n_matrix_pairs): + print('Building matrices ...', end='', flush=True) - row = rng1.randint(n_samples, size=nnz_a) - cols = rng1.randint(nr_vocab, size=nnz_a) - data = rng1.rand(nnz_a) + row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) + cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples) ]).flatten() + data = rng1.rand(len(row)) - a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a = a_sparse.tocsr() + a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) + a = a.tocsr() - row = rng1.randint(n_duplicates, size=nnz_b) - cols = rng1.randint(nr_vocab, size=nnz_b) - data = rng1.rand(nnz_b) + row = 
np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) + cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates) ]).flatten() + data = rng1.rand(len(row)) - b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) - b = b_sparse.T.tocsr() + b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) + b = b.T.tocsr() - C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True) + del row + del cols + del data + + print('Finished.', flush=True) + + print('Computing matrix product ...', flush=True) + C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True, use_threads=True, n_jobs=4) print(f'nnz(A*B) = {len(C.data)}', flush=True) print(f'ntop(A*B) = {C_ntop}', flush=True) print('', flush=True) @@ -61,6 +71,7 @@ ntop_arr[it] = C_ntop del C del C_ntop + print('Finished.', flush=True) # top 5 results per row From 90a6fd193ec8c4f6af730cd41c8af43befe80594 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Tue, 11 May 2021 10:00:17 +0200 Subject: [PATCH 23/29] made PEP8-conforming modifications --- string_grouper/string_grouper.py | 106 +++++++++--------- string_grouper/test/test_string_grouper.py | 26 +++-- string_grouper_topn/awesome_cossim_topn.py | 25 ++--- string_grouper_topn/example/comparison.py | 3 +- string_grouper_topn/example/comparison2.py | 56 +++++---- string_grouper_topn/example/comparison3.py | 11 +- .../test/test_awesome_cossim_topn.py | 4 +- string_grouper_utils/string_grouper_utils.py | 4 +- .../test/test_string_grouper_utils.py | 4 +- 9 files changed, 121 insertions(+), 118 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index c4cfbdef..7ebdaa82 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -4,6 +4,7 @@ import multiprocessing from sklearn.feature_extraction.text import TfidfVectorizer from scipy.sparse.csr import 
csr_matrix +from scipy.sparse.lil import lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union from string_grouper_topn import awesome_cossim_topn @@ -17,24 +18,24 @@ DEFAULT_IGNORE_CASE: bool = True # ignores case by default DEFAULT_DROP_INDEX: bool = False # includes index-columns in output DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most - # similar string index-columns with corresponding duplicates-index values -DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity - # matches appear in the output +# similar string index-columns with corresponding duplicates-index values +DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity +# matches appear in the output GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest - # similarity aggregate as group-representative: +# similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative: -DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default +DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default # The following string constants are used by (but aren't [yet] options passed to) StringGrouper DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches -DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches +DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches RIGHT_PREFIX: str = 'right_' # used to prefix 
columns on the right of the output of StringGrouper.get_matches MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of - # StringGrouper._get_nearest_matches -DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches +# StringGrouper._get_nearest_matches +DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of - # StringGrouper.get_nearest_matches +# StringGrouper.get_nearest_matches GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate # High level functions @@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple): Defaults to number of cores on a machine - 1. :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case). :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to True. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to False. :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'. The other choice is 'first'. 
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series, self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() - # _true_max_n_matches will contain the true maximum number of matches over all strings in master if - # self._config.min_similarity <= 0 + # _true_max_n_matches will contain the true maximum number of matches over all strings in master if + # self._config.min_similarity <= 0 self._true_max_n_matches = None def n_grams(self, string: str) -> List[str]: @@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" master_matrix, duplicate_matrix = self._get_tf_idf_matrices() - + # Calculate the matches using the cosine similarity matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix) - + if self._duplicates is None: # convert to lil format for best efficiency when setting matrix-elements - matches = matches.tolil() - # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by + matches = matches.tolil() + # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results) matches = StringGrouper._fix_diagonal(matches) if self._max_n_matches < self._true_max_n_matches: # the list of matches must be symmetric! 
(i.e., if A != B and A matches B; then B matches A) matches = StringGrouper._symmetrize_matrix(matches) matches = matches.tocsr() - + # build list from matrix self._matches_list = self._get_matches_list(matches) self.is_build = True @@ -283,14 +284,14 @@ def dot(self) -> pd.Series: @validate_is_fit def get_matches(self, ignore_index: Optional[bool] = None, - include_zeroes: Optional[bool]=None) -> pd.DataFrame: + include_zeroes: Optional[bool] = None) -> pd.DataFrame: """ Returns a DataFrame with all the matches and their cosine similarity. If optional IDs are used, returned as extra columns with IDs matched to respective data rows - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to self._config.ignore_index. - :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches + :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches appear in the output. Defaults to self._config.include_zeroes. 
""" def get_both_sides(master: pd.Series, @@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str): else: return data.rename(f"{prefix}{data.name}") - if ignore_index is None: ignore_index = self._config.ignore_index - if include_zeroes is None: include_zeroes = self._config.include_zeroes + if ignore_index is None: + ignore_index = self._config.ignore_index + if include_zeroes is None: + include_zeroes = self._config.include_zeroes if self._config.min_similarity > 0 or not include_zeroes: matches_list = self._matches_list elif include_zeroes: # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic): - # the fix includes zero-similarity matches that are missing by default - # in _matches_list due to our use of sparse matrices + # the fix includes zero-similarity matches that are missing by default + # in _matches_list due to our use of sparse matrices non_matches_list = self._get_non_matches_list() matches_list = self._matches_list if non_matches_list.empty else \ pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True) - + left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index) similarity = matches_list.similarity.reset_index(drop=True) if self._master_id is None: @@ -366,16 +369,18 @@ def get_groups(self, If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs above are returned as well altogether in a DataFrame. - :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to + :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to self._config.ignore_index. - :param replace_na: whether or not to replace NaN values in most similar string index-columns with + :param replace_na: whether or not to replace NaN values in most similar string index-columns with corresponding duplicates-index values. Defaults to self._config.replace_na. 
""" - if ignore_index is None: ignore_index = self._config.ignore_index + if ignore_index is None: + ignore_index = self._config.ignore_index if self._duplicates is None: return self._deduplicate(ignore_index=ignore_index) else: - if replace_na is None: replace_na = self._config.replace_na + if replace_na is None: + replace_na = self._config.replace_na return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na) @validate_is_fit @@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix """Builds the cossine similarity matrix of two csr matrices""" tf_idf_matrix_1 = master_matrix tf_idf_matrix_2 = duplicate_matrix.transpose() - + optional_kwargs = { 'return_best_ntop': True, 'use_threads': self._config.number_of_processes > 1, @@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame: all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side']) matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']]) missing_pairs = all_pairs.difference(matched_pairs) - if missing_pairs.empty: return pd.DataFrame() + if missing_pairs.empty: + return pd.DataFrame() if (self._max_n_matches < self._true_max_n_matches): raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n' f'\t\t max_n_matches={self._max_n_matches} is too small!\n' @@ -483,8 +489,8 @@ def _get_nearest_matches(self, master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}' master = self._master.rename(master_label).reset_index(drop=ignore_index) dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index) - - # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging + + # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging if isinstance(dupes, pd.DataFrame): master.rename( columns={col: f'{prefix}{col}' for col in 
master.columns if str(col) != master_label}, @@ -514,14 +520,14 @@ def _get_nearest_matches(self, if self._master_id is not None: # Also update the master_id-series with the duplicates_id in cases were there is no match dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id - + # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values # appear within them. So here we change them back to their original datatypes if possible: if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \ - self._duplicates_id.dtype == self._master_id.dtype: + self._duplicates_id.dtype == self._master_id.dtype: dupes_max_sim.loc[:, master_id_label] = \ - dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) - + dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype) + # Prepare the output: required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label] index_column_list = \ @@ -531,13 +537,13 @@ def _get_nearest_matches(self, # Update the master index-columns with the duplicates index-column values in cases were there is no match dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates'] dupes_max_sim.loc[rows_to_update, index_column_list] = \ - dupes_max_sim.loc[rows_to_update, dupes_index_columns].values - + dupes_max_sim.loc[rows_to_update, dupes_index_columns].values + # Restore their original datatypes if possible: for m, d in zip(index_column_list, dupes_index_columns): if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype: dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype) - + # Make sure to keep same order as duplicates dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side') output = dupes_max_sim[index_column_list + required_column_list] @@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> 
Tuple[pd.Series, master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True) dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) return master_indices, dupe_indices - + def _validate_group_rep_specs(self): group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID) if self._config.group_rep not in group_rep_options: @@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self): ) @staticmethod - def _fix_diagonal(A) -> csr_matrix: - r = np.arange(A.shape[0]) - A[r, r] = 1 - return A + def _fix_diagonal(m: lil_matrix) -> csr_matrix: + r = np.arange(m.shape[0]) + m[r, r] = 1 + return m @staticmethod - def _symmetrize_matrix(A) -> csr_matrix: - r, c = A.nonzero() - A[c, r] = A[r, c] - return A + def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix: + r, c = m_symmetric.nonzero() + m_symmetric[c, r] = m_symmetric[r, c] + return m_symmetric @staticmethod def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 4344177a..2438d679 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -9,8 +9,10 @@ compute_pairwise_similarities from unittest.mock import patch -def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix: - return A + +def mock_symmetrize_matrix(x: csr_matrix) -> csr_matrix: + return x + class SimpleExample(object): def __init__(self): @@ -201,13 +203,13 @@ def test_match_strings(self, mock_StringGouper): 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix ) - def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix): - """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is + def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix_param): + """mocks 
StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is **partially** symmetric which often occurs when the kwarg max_n_matches is too small""" simple_example = SimpleExample() df = simple_example.customers_df2['Customer Name'] sg = StringGrouper(df, max_n_matches=2).fit() - mock_symmetrize_matrix.assert_called_once() + mock_symmetrize_matrix_param.assert_called_once() # obtain the upper and lower triangular parts of the matrix of matches: upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']] lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']] @@ -216,7 +218,7 @@ def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_m # obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # if the intersection is empty then _matches_list is completely non-symmetric (this is acceptable) - # if the intersection is not empty then at least some matches are repeated. + # if the intersection is not empty then at least some matches are repeated. # To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. self.assertFalse(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -234,7 +236,7 @@ def test_match_list_symmetry_with_symmetrize_function(self): # Obtain the intersection between upper and upper_prime: intersection = upper_prime.merge(upper, how='inner', on=['master_side', 'dupe_side']) # If the intersection is empty this means _matches_list is completely non-symmetric (this is acceptable) - # If the intersection is not empty this means at least some matches are repeated. + # If the intersection is not empty this means at least some matches are repeated. 
# To make sure all (and not just some) matches are repeated, the lengths of # upper, upper_prime and their intersection should be identical. self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection)) @@ -267,7 +269,7 @@ def test_match_list_diagonal(self): self.assertEqual(num_self_joins, num_strings) def test_zero_min_similarity(self): - """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are + """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -276,7 +278,7 @@ def test_zero_min_similarity(self): pd.testing.assert_frame_equal(simple_example.expected_result_with_zeroes, matches) def test_zero_min_similarity_small_max_n_matches(self): - """This test ensures that a warning is issued when n_max_matches is suspected to be too small while + """This test ensures that a warning is issued when n_max_matches is suspected to be too small while min_similarity <= 0 and include_zeroes is True""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] @@ -665,9 +667,9 @@ def test_get_groups_4_df_same_similarity(self): test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob']) test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3']) test_series_id_2 = pd.Series(['B0', 'B1', 'B2', 'B3']) - sg = StringGrouper(test_series_1, - test_series_2, - master_id=test_series_id_1, + sg = StringGrouper(test_series_1, + test_series_2, + master_id=test_series_id_1, duplicates_id=test_series_id_2, ignore_index=True) sg = sg.fit() diff --git a/string_grouper_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py index 4f90ae63..65be44de 100644 --- a/string_grouper_topn/awesome_cossim_topn.py +++ b/string_grouper_topn/awesome_cossim_topn.py 
@@ -2,7 +2,6 @@ import numpy as np from scipy.sparse import csr_matrix from scipy.sparse import isspmatrix_csr -from numpy import indices if sys.version_info[0] >= 3: from string_grouper_topn import sparse_dot_topn as ct @@ -27,13 +26,13 @@ def awesome_cossim_topn( lower_bound: a threshold that the element of A*B must be greater than use_threads: use multi-thread or not n_jobs: number of thread, must be >= 1 - return_best_ntop: (default: False) if True, will return best_ntop together + return_best_ntop: (default: False) if True, will return best_ntop together with C as a tuple: (C, best_ntop) Output: C: result matrix (returned alone, if return_best_ntop=False) - best_ntop: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_ntop). It is + best_ntop: The true maximum number of elements > lower_bound per row of + A * B returned together with C as a tuple: (C, best_ntop). It is returned only if return_best_ntop=True. N.B. if A and B are not in CSR format, they will be converted to CSR @@ -42,10 +41,11 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: try: ind_arr = np.empty(sz, dtype=idx_dtype) dat_arr = np.empty(sz, dtype=data_dtype) + del ind_arr, dat_arr return True except MemoryError: return False - + if not isspmatrix_csr(A): A = A.tocsr() if not isspmatrix_csr(B): @@ -74,22 +74,22 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: return output indptr = np.empty(M + 1, dtype=idx_dtype) - + # reduce nnz_max if too large to fit in available memory: while (not try_malloc(nnz_max, idx_dtype, A.dtype)): nnz_max = nnz_max//2 # take a chance on high matrix-sparsity and reduce further: nnz_max = max(M, nnz_max//16) - + # filled matrices from here on indices = np.empty(nnz_max, dtype=idx_dtype) data = np.empty(nnz_max, dtype=A.dtype) - + best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - + if not use_threads: - + alt_indices, alt_data = ct.sparse_dot_topn_extd( M, N, np.asarray(A.indptr, 
dtype=idx_dtype), np.asarray(A.indices, dtype=idx_dtype), @@ -118,15 +118,14 @@ def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: lower_bound, indptr, indices, data, best_ntop_arr, n_jobs ) - + if alt_indices is not None: indices = alt_indices data = alt_data - + # prepare and return the output: output = csr_matrix((data, indices, indptr), shape=(M, N)) if return_best_ntop: return output, best_ntop_arr[0] else: return output - diff --git a/string_grouper_topn/example/comparison.py b/string_grouper_topn/example/comparison.py index ce3cc0ad..d2d41efc 100644 --- a/string_grouper_topn/example/comparison.py +++ b/string_grouper_topn/example/comparison.py @@ -6,7 +6,6 @@ import timeit import numpy as np from scipy.sparse import coo_matrix -from string_grouper_topn import awesome_cossim_topn # noqa: F401 N = 1000 thresh = 0.01 @@ -122,7 +121,7 @@ def get_csr_ntop_idx_data(csr_row, ntop): return sorted(result, key=lambda x: -x[1]) -def scipy_cossim_top(A, B, ntop, lower_bound=0): +def scipy_cossim_top(A, B, ntop): C = A.dot(B) return [get_csr_ntop_idx_data(row, ntop) for row in C] diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py index ca4e1fff..75e99461 100644 --- a/string_grouper_topn/example/comparison2.py +++ b/string_grouper_topn/example/comparison2.py @@ -8,8 +8,6 @@ import pandas as pd from scipy.sparse import coo_matrix from string_grouper_topn import awesome_cossim_topn # noqa: F401 -from test.sortperf import flush -from _sqlite3 import Row df = pd.DataFrame(columns=['sample', '#threads', 'python']) @@ -41,25 +39,25 @@ r = 0 for it in range(n_matrix_pairs): print('Building matrices ...', end='', flush=True) - + row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) - cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples) ]).flatten() + cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples)]).flatten() data = 
rng1.rand(len(row)) - + a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) a = a.tocsr() - + row = np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) - cols = np.asarray([ rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates) ]).flatten() + cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_duplicates)]).flatten() data = rng1.rand(len(row)) - + b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) b = b.T.tocsr() - + del row del cols del data - + print('Finished.', flush=True) print('Computing matrix product ...', flush=True) @@ -72,11 +70,11 @@ del C del C_ntop print('Finished.', flush=True) - + # top 5 results per row - + print("Non-parallelized sparse_dot_topn function") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', number=3, globals=globals()) @@ -84,9 +82,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 1 thread") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', number=3, globals=globals()) @@ -94,9 +92,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 2 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', number=3, globals=globals()) @@ -104,9 +102,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 3 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', number=3, globals=globals()) @@ -114,9 +112,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 4 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', number=3, globals=globals()) @@ -124,9 +122,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + 
print("Threaded function with 5 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', number=3, globals=globals()) @@ -134,9 +132,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 6 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', number=3, globals=globals()) @@ -144,9 +142,9 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print("Threaded function with 7 threads") - + rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', number=3, globals=globals()) @@ -154,7 +152,7 @@ r += 1 print('sample\t\tpython', flush=True) print(f'{it}\t\t{rtv:7.4f}', flush=True) - + print('') print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') @@ -162,7 +160,7 @@ df = df.astype({ 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() - + print(results) print('') print('') diff --git a/string_grouper_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py index 74983dde..de0984f4 100644 --- a/string_grouper_topn/example/comparison3.py +++ b/string_grouper_topn/example/comparison3.py @@ -3,10 +3,10 @@ """ from __future__ import print_function -import timeit +# import timeit import time import numpy as np -import pandas as pd +# import pandas as pd from scipy.sparse import load_npz from string_grouper_topn import awesome_cossim_topn # noqa: F401 @@ -24,8 +24,8 @@ thresh = 0.8 nr_vocab = b.shape[0] -density_A = len(a.data)/(a.shape[0]*a.shape[1]) -density_B = len(b.data)/(b.shape[0]*b.shape[1]) +density_A = len(a.data)/(a.shape[0]*a.shape[1]) +density_B = len(b.data)/(b.shape[0]*b.shape[1]) n_samples = a.shape[0] n_duplicates = b.shape[1] nnz_a = len(a.data) @@ -49,11 +49,10 @@ it = 0 tic 
= time.perf_counter() -C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs = 7, return_best_ntop=True) +C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs=7, return_best_ntop=True) toc = time.perf_counter() print('scout_nnz=True, use_threads=True, n_jobs = 7') print(f'nnz(A*B) = {len(C.data)}', flush=True) print(f'ntop(A*B) = {C_ntop}', flush=True) print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) - diff --git a/string_grouper_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py index 80a71431..ffb17915 100644 --- a/string_grouper_topn/test/test_awesome_cossim_topn.py +++ b/string_grouper_topn/test/test_awesome_cossim_topn.py @@ -39,7 +39,7 @@ def helper_awesome_cossim_topn_dense( b_dense, use_threads=False, n_jobs=1 - ): + ): dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product sparse_result = csr_matrix(dense_result) sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) @@ -115,7 +115,7 @@ def helper_awesome_cossim_topn_sparse( flag=True, use_threads=False, n_jobs=1 - ): + ): # Note: helper function using awesome_cossim_topn sparse_result = a_sparse.dot(b_sparse.T) # dot product sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) diff --git a/string_grouper_utils/string_grouper_utils.py b/string_grouper_utils/string_grouper_utils.py index a570b377..e674367b 100644 --- a/string_grouper_utils/string_grouper_utils.py +++ b/string_grouper_utils/string_grouper_utils.py @@ -137,8 +137,8 @@ def get_column(col: Union[str, int, List[Union[str, int]]], data: pd.DataFrame): def parse_timestamps(timestamps: pd.Series, parserinfo=None, **kwargs) -> pd.Series: - error_msg = f"timestamps must be a Series of date-like or datetime-like strings" - error_msg += f" or datetime datatype or pandas Timestamp datatype or numbers" + error_msg = "timestamps must be a Series of date-like or datetime-like strings" + error_msg += " or datetime datatype or pandas Timestamp 
datatype or numbers" if is_series_of_type(str, timestamps): # if any of the strings is not datetime-like raise an exception if timestamps.to_frame().applymap(is_date).squeeze().all(): diff --git a/string_grouper_utils/test/test_string_grouper_utils.py b/string_grouper_utils/test/test_string_grouper_utils.py index 3798e3cd..0c8a8ee4 100644 --- a/string_grouper_utils/test/test_string_grouper_utils.py +++ b/string_grouper_utils/test/test_string_grouper_utils.py @@ -1,8 +1,8 @@ import unittest import pandas as pd from dateutil.parser import parse -from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, new_group_rep_by_completeness, \ - new_group_rep_by_highest_weight +from string_grouper_utils.string_grouper_utils import new_group_rep_by_earliest_timestamp, \ + new_group_rep_by_completeness, new_group_rep_by_highest_weight class SimpleExample(object): From 32d7136db5467beebc2d0468032b29f248ac2e46 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 15:39:56 +0200 Subject: [PATCH 24/29] removed string_grouper_topn submodule --- setup.py | 79 +-- string_grouper_topn/__init__.py | 7 - string_grouper_topn/array_wrappers.pxd | 16 - string_grouper_topn/array_wrappers.pyx | 73 --- string_grouper_topn/awesome_cossim_topn.py | 132 ---- string_grouper_topn/example/comparison.py | 136 ----- string_grouper_topn/example/comparison2.py | 166 ----- string_grouper_topn/example/comparison3.py | 58 -- string_grouper_topn/example/example.py | 14 - string_grouper_topn/sparse_dot_topn.pyx | 261 -------- .../sparse_dot_topn_parallel.cpp | 571 ------------------ .../sparse_dot_topn_parallel.h | 78 --- .../sparse_dot_topn_source.cpp | 446 -------------- string_grouper_topn/sparse_dot_topn_source.h | 80 --- .../sparse_dot_topn_threaded.pyx | 190 ------ .../test/test_awesome_cossim_topn.py | 313 ---------- 16 files changed, 8 insertions(+), 2612 deletions(-) delete mode 100644 
string_grouper_topn/__init__.py delete mode 100644 string_grouper_topn/array_wrappers.pxd delete mode 100644 string_grouper_topn/array_wrappers.pyx delete mode 100644 string_grouper_topn/awesome_cossim_topn.py delete mode 100644 string_grouper_topn/example/comparison.py delete mode 100644 string_grouper_topn/example/comparison2.py delete mode 100644 string_grouper_topn/example/comparison3.py delete mode 100644 string_grouper_topn/example/example.py delete mode 100644 string_grouper_topn/sparse_dot_topn.pyx delete mode 100644 string_grouper_topn/sparse_dot_topn_parallel.cpp delete mode 100644 string_grouper_topn/sparse_dot_topn_parallel.h delete mode 100644 string_grouper_topn/sparse_dot_topn_source.cpp delete mode 100644 string_grouper_topn/sparse_dot_topn_source.h delete mode 100644 string_grouper_topn/sparse_dot_topn_threaded.pyx delete mode 100644 string_grouper_topn/test/test_awesome_cossim_topn.py diff --git a/setup.py b/setup.py index cf333180..f4b5ecb0 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ -from setuptools import setup, Extension +from setuptools import setup import pathlib -import os # The directory containing this file HERE = pathlib.Path(__file__).parent @@ -8,65 +7,13 @@ # The text of the README file README = (HERE / "README.md").read_text() -# workaround for numpy and Cython install dependency -# the solution is from https://stackoverflow.com/a/54138355 -def my_build_ext(pars): - # import delayed: - from setuptools.command.build_ext import build_ext as _build_ext - class build_ext(_build_ext): - def finalize_options(self): - _build_ext.finalize_options(self) - # Prevent numpy from thinking it is still in its setup process: - __builtins__.__NUMPY_SETUP__ = False - import numpy - self.include_dirs.append(numpy.get_include()) - - #object returned: - return build_ext(pars) - -if os.name == 'nt': - extra_compile_args = ["-Ox"] -else: - extra_compile_args = ['-std=c++0x', '-pthread', '-O3'] - -array_wrappers_ext = 
Extension('string_grouper_topn.array_wrappers', - sources=[ - './string_grouper_topn/array_wrappers.pyx', - ], - extra_compile_args=extra_compile_args, - language='c++') - -original_ext = Extension('string_grouper_topn.sparse_dot_topn', - sources=[ - './string_grouper_topn/sparse_dot_topn.pyx', - './string_grouper_topn/sparse_dot_topn_source.cpp', - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - -threaded_ext = Extension('string_grouper_topn.sparse_dot_topn_threaded', - sources=[ - './string_grouper_topn/sparse_dot_topn_threaded.pyx', - './string_grouper_topn/sparse_dot_topn_source.cpp', - './string_grouper_topn/sparse_dot_topn_parallel.cpp', - ], - extra_compile_args=extra_compile_args, - define_macros=[('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')], - language='c++') - setup( name='string_grouper', version='0.4.0', - packages=[ - 'string_grouper_topn', - 'string_grouper', - 'string_grouper_utils', - ], + packages=['string_grouper'], license='MIT License', description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. ' 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html', - keywords='cosine-similarity sparse-matrix sparse-graph scipy cython', author='Chris van den Berg', long_description=README, long_description_content_type="text/markdown", @@ -74,20 +21,10 @@ def finalize_options(self): url='https://github.com/Bergvca/string_grouper', zip_safe=False, python_requires='>3.7', - setup_requires=[# Setuptools 18.0 properly handles Cython extensions. - 'setuptools>=18.0', - 'cython>=0.29.15', - 'numpy', - 'scipy', - ], - install_requires=[# Setuptools 18.0 properly handles Cython extensions. 
- 'setuptools>=18.0', - 'cython>=0.29.15', - 'numpy', - 'scipy', - 'scikit-learn', - 'pandas>=0.25.3', - ], - cmdclass={'build_ext': my_build_ext}, - ext_modules=[array_wrappers_ext, original_ext, threaded_ext], + install_requires=['pandas>=0.25.3' + , 'scipy' + , 'scikit-learn' + , 'numpy' + , 'sparse_dot_topn>=0.2.6' + ] ) diff --git a/string_grouper_topn/__init__.py b/string_grouper_topn/__init__.py deleted file mode 100644 index b123439e..00000000 --- a/string_grouper_topn/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# flake8: noqa -import sys - -if sys.version_info[0] >= 3: - from string_grouper_topn.awesome_cossim_topn import awesome_cossim_topn -else: - from awesome_cossim_topn import awesome_cossim_topn \ No newline at end of file diff --git a/string_grouper_topn/array_wrappers.pxd b/string_grouper_topn/array_wrappers.pxd deleted file mode 100644 index 3af1a3c4..00000000 --- a/string_grouper_topn/array_wrappers.pxd +++ /dev/null @@ -1,16 +0,0 @@ -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - cdef int view_count - cdef vector[int] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - cdef int view_count - cdef vector[double] vec - cdef Py_ssize_t shape[2] - cdef Py_ssize_t strides[2] diff --git a/string_grouper_topn/array_wrappers.pyx b/string_grouper_topn/array_wrappers.pyx deleted file mode 100644 index 18525766..00000000 --- a/string_grouper_topn/array_wrappers.pyx +++ /dev/null @@ -1,73 +0,0 @@ -from cpython cimport Py_buffer -from libcpp.vector cimport vector - -# define a Cython array wrapper class to hold a C++ vector of ints, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_int: - # constructor and destructor are fairly unimportant now since - # vec will be 
destroyed automatically. - - def __cinit__(self, vector[int]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'i' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 - - -# define a Cython array wrapper class to hold a C++ vector of doubles, adhering to numpy's buffer protocol: -cdef class ArrayWrapper_double: - # constructor and destructor are fairly unimportant now since - # vec will be destroyed automatically. 
- - def __cinit__(self, vector[double]& data): - self.vec.swap(data) - self.view_count = 0 - - # now implement the buffer protocol for the class - # which makes it generally useful to anything that expects an array - def __getbuffer__(self, Py_buffer *buffer, int flags): - # relevant documentation http://cython.readthedocs.io/en/latest/src/userguide/buffer.html#a-matrix-class - cdef Py_ssize_t itemsize = sizeof(self.vec[0]) - - self.shape[1] = self.vec.size() - self.shape[0] = 1 - self.strides[1] = ( &(self.vec[1]) - &(self.vec[0])) - self.strides[0] = self.vec.size() * self.strides[1] - buffer.buf = &(self.vec[0]) - buffer.format = 'd' - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.vec.size() * itemsize # product(shape) * itemsize - buffer.ndim = 2 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - self.view_count += 1 - - def __releasebuffer__(self, Py_buffer *buffer): - self.view_count -= 1 diff --git a/string_grouper_topn/awesome_cossim_topn.py b/string_grouper_topn/awesome_cossim_topn.py deleted file mode 100644 index 09ee7917..00000000 --- a/string_grouper_topn/awesome_cossim_topn.py +++ /dev/null @@ -1,132 +0,0 @@ -import sys -import numpy as np -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr - -if sys.version_info[0] >= 3: - from string_grouper_topn import sparse_dot_topn as ct - from string_grouper_topn import sparse_dot_topn_threaded as ct_thread -else: - import sparse_dot_topn as ct - import sparse_dot_topn_threaded as ct_thread - - -def awesome_cossim_topn( - A, B, ntop, lower_bound=0, use_threads=False, n_jobs=1, return_best_ntop=False): - """ - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. 
- If return_best_ntop=True then best_ntop - (the true maximum number of elements > lower_bound per row of A * B) - will also be returned in a tuple together with C as (C, best_ntop). - - Input: - A and B: two CSR matrices - ntop: top n results - lower_bound: a threshold that the element of A*B must be greater than - use_threads: use multi-thread or not - n_jobs: number of thread, must be >= 1 - return_best_ntop: (default: False) if True, will return best_ntop together - with C as a tuple: (C, best_ntop) - - Output: - C: result matrix (returned alone, if return_best_ntop=False) - best_ntop: The true maximum number of elements > lower_bound per row of - A * B returned together with C as a tuple: (C, best_ntop). It is - returned only if return_best_ntop=True. - - N.B. if A and B are not in CSR format, they will be converted to CSR - """ - def try_malloc(sz: int, idx_dtype, data_dtype) -> bool: - try: - ind_arr = np.empty(sz, dtype=idx_dtype) - dat_arr = np.empty(sz, dtype=data_dtype) - del ind_arr, dat_arr - return True - except MemoryError: - return False - - if not isspmatrix_csr(A): - A = A.tocsr() - if not isspmatrix_csr(B): - B = B.tocsr() - - M, K1 = A.shape - K2, N = B.shape - - if K1 != K2: - err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!' - raise ValueError(err_str) - - idx_dtype = np.int32 - - nnz_max = M*ntop - - # basic check. 
if A or B are all zeros matrix, return all zero matrix directly - if len(A.indices) == 0 or len(B.indices) == 0: - indptr = np.zeros(M + 1, dtype=idx_dtype) - indices = np.zeros(nnz_max, dtype=idx_dtype) - data = np.zeros(nnz_max, dtype=A.dtype) - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_ntop: - return output, 0 - else: - return output - - indptr = np.empty(M + 1, dtype=idx_dtype) - - # reduce nnz_max if too large to fit in available memory: - nnz_max = 16*nnz_max - while (not try_malloc(nnz_max, idx_dtype, A.dtype)): - nnz_max = nnz_max//2 - - # take a chance on high matrix-sparsity and reduce further: - nnz_max = max(M, nnz_max//16) - - # filled matrices from here on - indices = np.empty(nnz_max, dtype=idx_dtype) - data = np.empty(nnz_max, dtype=A.dtype) - - best_ntop_arr = np.full(1, 0, dtype=idx_dtype) - - if not use_threads: - - alt_indices, alt_data = ct.sparse_dot_topn_extd( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_ntop_arr - ) - - else: - if n_jobs < 1: - err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!' 
- raise ValueError(err_str) - - alt_indices, alt_data = ct_thread.sparse_dot_topn_extd_threaded( - M, N, np.asarray(A.indptr, dtype=idx_dtype), - np.asarray(A.indices, dtype=idx_dtype), - A.data, - np.asarray(B.indptr, dtype=idx_dtype), - np.asarray(B.indices, dtype=idx_dtype), - B.data, - ntop, - lower_bound, - indptr, indices, data, best_ntop_arr, n_jobs - ) - - if alt_indices is not None: - indices = alt_indices - data = alt_data - - # prepare and return the output: - output = csr_matrix((data, indices, indptr), shape=(M, N)) - if return_best_ntop: - return output, best_ntop_arr[0] - else: - return output diff --git a/string_grouper_topn/example/comparison.py b/string_grouper_topn/example/comparison.py deleted file mode 100644 index d2d41efc..00000000 --- a/string_grouper_topn/example/comparison.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -import timeit -import numpy as np -from scipy.sparse import coo_matrix - -N = 1000 -thresh = 0.01 - -nr_vocab = 2 << 24 -density = 1e-6 -n_samples = 1000000 -n_duplicates = 1000000 -nnz_a = int(n_samples * nr_vocab * density) -nnz_b = int(n_duplicates * nr_vocab * density) - - -print(f'density = {density}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_a = {nnz_a}', flush=True) -print(f'nnz_b = {nnz_b}', flush=True) -print('\n', flush=True) - -rng1 = np.random.RandomState(42) -rng2 = np.random.RandomState(43) - -row = rng1.randint(n_samples, size=nnz_a) -cols = rng2.randint(nr_vocab, size=nnz_a) -data = rng1.rand(nnz_a) - -a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) -a = a_sparse.tocsr() - -row = rng1.randint(n_duplicates, size=nnz_b) -cols = rng2.randint(nr_vocab, size=nnz_b) -data = rng1.rand(nnz_b) - -b_sparse = coo_matrix((data, (row, cols)), 
shape=(n_duplicates, nr_vocab)) -b = b_sparse.T.tocsr() - - -# top 5 results per row - -print("Original sparse_dot_topn function") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 1 thread") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 2 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 3 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 4 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 5 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 6 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) -print(rtv) - -print("Threaded function with 7 threads") - -rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) -print(rtv) - -# use scipy and numpy function - - -def get_csr_ntop_idx_data(csr_row, ntop): - """ - Get list (row index, score) of the n top matches - """ - nnz = csr_row.getnnz() - if nnz == 0: - return None - elif nnz <= ntop: - result = zip(csr_row.indices, csr_row.data) - else: - arg_idx = np.argpartition(csr_row.data, -ntop)[-ntop:] - result = zip(csr_row.indices[arg_idx], csr_row.data[arg_idx]) - - return sorted(result, key=lambda x: -x[1]) - - -def scipy_cossim_top(A, B, ntop): - C = A.dot(B) - return [get_csr_ntop_idx_data(row, ntop) for row in C] - -# top 5 results per row which element is greater than 2 - - 
-print("Scipy+numpy original function") - -rtv = timeit.timeit('scipy_cossim_top(a, b, N, thresh)', - number=3, - globals=globals()) -print(rtv) diff --git a/string_grouper_topn/example/comparison2.py b/string_grouper_topn/example/comparison2.py deleted file mode 100644 index 557eedd2..00000000 --- a/string_grouper_topn/example/comparison2.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -import timeit -import numpy as np -import pandas as pd -from scipy.sparse import coo_matrix -from string_grouper_topn import awesome_cossim_topn # noqa: F401 - -df = pd.DataFrame(columns=['sample', '#threads', 'python']) - -N = 4000 -thresh = 0.01 - -nr_vocab = int(26**3) -density = 30/nr_vocab -n_samples = 1000000 -n_duplicates = N -nnz_a = int(n_samples * nr_vocab * density) -nnz_b = int(n_duplicates * nr_vocab * density) - -print(f'ntop = {N}', flush=True) -print(f'threshold = {thresh}', flush=True) -print(f'density = {density}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_A = {nnz_a}', flush=True) -print(f'nnz_B = {nnz_b}', flush=True) -print('', flush=True) - -rng1 = np.random.RandomState(42) - -n_matrix_pairs = 2**4 -nnz_arr = np.full(n_matrix_pairs, 0) -ntop_arr = np.full(n_matrix_pairs, 0) -r = 0 -for it in range(n_matrix_pairs): - print('Building matrices ...', end='', flush=True) - - row = np.repeat(np.arange(n_samples), int(nr_vocab*density)) - cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in range(n_samples)]).flatten() - data = rng1.rand(len(row)) - - a = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a = a.tocsr() - - row = np.repeat(np.arange(n_duplicates), int(nr_vocab*density)) - cols = np.asarray([rng1.randint(nr_vocab, size=int(nr_vocab*density)) for _ in 
range(n_duplicates)]).flatten() - data = rng1.rand(len(row)) - - b = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab)) - b = b.T.tocsr() - - del row - del cols - del data - - print('Finished.', flush=True) - - print('Computing matrix product ...', flush=True) - C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True, use_threads=True, n_jobs=4) - print(f'nnz(A*B) = {len(C.data)}', flush=True) - print(f'ntop(A*B) = {C_ntop}', flush=True) - print('', flush=True) - nnz_arr[it] = len(C.data) - ntop_arr[it] = C_ntop - del C - del C_ntop - print('Finished.', flush=True) - - # top 5 results per row - - print("Non-parallelized sparse_dot_topn function") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)', - number=3, - globals=globals()) - df.loc[r] = [it, 0, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 1 thread") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)', - number=3, - globals=globals()) - df.loc[r] = [it, 1, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 2 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)', - number=3, - globals=globals()) - df.loc[r] = [it, 2, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 3 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)', - number=3, - globals=globals()) - df.loc[r] = [it, 3, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 4 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)', - number=3, - globals=globals()) - df.loc[r] = [it, 4, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - 
print("Threaded function with 5 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)', - number=3, - globals=globals()) - df.loc[r] = [it, 5, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 6 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)', - number=3, - globals=globals()) - df.loc[r] = [it, 6, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print("Threaded function with 7 threads") - - rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)', - number=3, - globals=globals()) - df.loc[r] = [it, 7, rtv] - r += 1 - print('sample\t\tpython', flush=True) - print(f'{it}\t\t{rtv:7.4f}', flush=True) - - print('') - print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}') - print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}') - print('') - df = df.astype({ - 'sample': np.int64, '#threads': np.int64, 'python': np.float64}) - results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean() - - print(results) - print('') - print('') diff --git a/string_grouper_topn/example/comparison3.py b/string_grouper_topn/example/comparison3.py deleted file mode 100644 index de0984f4..00000000 --- a/string_grouper_topn/example/comparison3.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -This file compare our boosting method with calling scipy+numpy function directly -""" - -from __future__ import print_function -# import timeit -import time -import numpy as np -# import pandas as pd -from scipy.sparse import load_npz -from string_grouper_topn import awesome_cossim_topn # noqa: F401 - -a = load_npz('sparse_matrix_A.npz') -b = load_npz('sparse_matrix_B.npz') - -# tic = time.perf_counter() -# p = np.random.permutation(a.shape[0]) -# a = a[p] -# toc = time.perf_counter() -# print(f'shuffle(A) took {(toc - tic):0.4f} seconds', 
flush=True) - - -N = b.shape[1] -thresh = 0.8 - -nr_vocab = b.shape[0] -density_A = len(a.data)/(a.shape[0]*a.shape[1]) -density_B = len(b.data)/(b.shape[0]*b.shape[1]) -n_samples = a.shape[0] -n_duplicates = b.shape[1] -nnz_a = len(a.data) -nnz_b = len(b.data) - -print(f'ntop = {N}', flush=True) -print(f'threshold = {thresh}', flush=True) -print(f'density(A) = {density_A}', flush=True) -print(f'density(B) = {density_B}', flush=True) -print(f'nr_vocab = {nr_vocab}', flush=True) -print(f'n_samples = {n_samples}', flush=True) -print(f'n_duplicates = {n_duplicates}', flush=True) -print(f'nnz_A = {nnz_a}', flush=True) -print(f'nnz_B = {nnz_b}', flush=True) -print('', flush=True) - -n_matrix_pairs = 1 -nnz_arr = np.full(n_matrix_pairs, 0) -ntop_arr = np.full(n_matrix_pairs, 0) -r = 0 -it = 0 - -tic = time.perf_counter() -C, C_ntop = awesome_cossim_topn(a, b, N, thresh, use_threads=True, n_jobs=7, return_best_ntop=True) -toc = time.perf_counter() - -print('scout_nnz=True, use_threads=True, n_jobs = 7') -print(f'nnz(A*B) = {len(C.data)}', flush=True) -print(f'ntop(A*B) = {C_ntop}', flush=True) -print(f'duration(A*B) = {(toc - tic):0.4f}', flush=True) diff --git a/string_grouper_topn/example/example.py b/string_grouper_topn/example/example.py deleted file mode 100644 index a403d3ab..00000000 --- a/string_grouper_topn/example/example.py +++ /dev/null @@ -1,14 +0,0 @@ -from scipy.sparse import rand -from string_grouper_topn import awesome_cossim_topn - -N = 10 -a = rand(100, 1000000, density=0.005, format='csr') -b = rand(1000000, 200, density=0.005, format='csr') - -# Use standard implementation - -c = awesome_cossim_topn(a, b, 5, 0.01) - -# Use parallel implementation with 4 threads - -d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4) diff --git a/string_grouper_topn/sparse_dot_topn.pyx b/string_grouper_topn/sparse_dot_topn.pyx deleted file mode 100644 index 974b4ce9..00000000 --- a/string_grouper_topn/sparse_dot_topn.pyx +++ /dev/null @@ -1,261 +0,0 @@ -# 
-# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Zhe Sun, Ahmet Erdem -# April 20, 2017 -# Modified by: Particular Miner -# April 14, 2021 - -# distutils: language = c++ - -from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double - -cimport numpy as np -import numpy as np - -np.import_array() - - -cdef extern from "sparse_dot_topn_source.h": - - cdef void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] - ); - - cdef int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - vector[int]* alt_Cj, - vector[double]* alt_Cx, - int nnz_max, - int* nminmax - ); - - cdef int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound - ); - -cpdef sparse_dot_topn( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] 
a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data -): - """ - Cython glue function to call sparse_dot_topn C++ implementation - This function will return a matrix C in CSR format, where - C = [sorted top n results and results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of C matrix - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! - """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx - ) - return - -cpdef sparse_dot_topn_extd( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax -): - """ - Cython glue function to call sparse_dot_topn_extd C++ - implementation. 
This function will return a matrix C in CSR - format, where - C = [sorted top n results > lower_bound for each row of A * B] - The maximum number nminmax of elements per row of C (assuming - n = number of columns of B) is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Output by reference: - c_indptr, c_indices, c_data: CSR expression of matrix C - nminmax: The maximum number of elements per row of C - (assuming ntop = n_col) - - Returned output: - c_indices, c_data: CSR expression of matrix C. These will - be returned instead of output by reference - if the preset sizes of c_indices and - c_data are too small to hold all the - results. - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types - of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - cdef nnz_max = len(c_indices) - - cdef vector[int] vCj; - cdef vector[double] vCx; - - cdef int nnz_max_is_too_small = sparse_dot_topn_extd_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax - ) - - if nnz_max_is_too_small: - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data - - else: - - return None, None - -cpdef sparse_dot_only_nnz( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound -): - """ - Cython glue function to call sparse_dot_nnz_only C++ implementation - This function will return nnz, the total number of nonzero - matrix-components of - C = [top n results > lower_bound for each row of A * B]. - - Input: - a_indptr, a_indices, a_data: CSR expression of A matrix - b_indptr, b_indices, b_data: CSR expression of B matrix - - ntop: n, the number of topmost results > lower_bound for - each row of C - lower_bound: a threshold that the element of A*B must - greater than - - Returned output: - nnz: the total number of nonzero matrix-components of C - - N.B. A and B must be CSR format!!! - The type of input numpy array must be aligned with types of C++ function arguments! 
- """ - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - - return sparse_dot_only_nnz_source( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound - ) diff --git a/string_grouper_topn/sparse_dot_topn_parallel.cpp b/string_grouper_topn/sparse_dot_topn_parallel.cpp deleted file mode 100644 index 0efb7a45..00000000 --- a/string_grouper_topn/sparse_dot_topn_parallel.cpp +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#include -#include -#include -#include -#include - -#include "./sparse_dot_topn_source.h" -#include "./sparse_dot_topn_parallel.h" - - -struct job_range_type {int begin; int end;}; - -void distribute_load( - int load_sz, - int n_jobs, - std::vector &ranges -) -{ - // share the load among jobs: - int equal_job_load_sz = load_sz/n_jobs; - int rem = load_sz % n_jobs; - ranges.resize(n_jobs); - - int start = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - ranges[job_nr].begin = start; - ranges[job_nr].end = start + equal_job_load_sz + ((job_nr < rem)? 1 : 0); - start = ranges[job_nr].end; - } -} - -void inner_gather_v2( - job_range_type job_range, - int Cp[], - int Cp_start, - int Cj[], - double Cx[], - std::vector* real_candidates, - std::vector* row_nnz -) -{ - if (job_range.begin >= job_range.end) return; - - int* nnz_begin = row_nnz->data(); - int* nnz_end = nnz_begin + row_nnz->size(); - - int* Cp_begin = &Cp[job_range.begin + 1]; - - (*row_nnz)[0] += Cp_start; - std::partial_sum(nnz_begin, nnz_end, Cp_begin); - - candidate* c_begin = real_candidates->data(); - candidate* c_end = c_begin + real_candidates->size(); - - int* Cj_begin = &Cj[Cp_start]; - double* Cx_begin = &Cx[Cp_start]; - - std::transform(c_begin, c_end, Cj_begin, [](candidate c) -> int { return c.index; }); - std::transform(c_begin, c_end, Cx_begin, [](candidate c) -> double { return c.value; }); -} - -void inner_gather_v1( - job_range_type job_range, - int Cp[], - int Cp_start, - int vCj_start[], - double vCx_start[], - std::vector* real_candidates, - std::vector* row_nnz -) -{ - candidate* c = real_candidates->data(); - int* vCj_cursor = &vCj_start[Cp_start]; - double* vCx_cursor = &vCx_start[Cp_start]; - - int Cp_i = Cp_start; - int* row_nnz_ptr = row_nnz->data(); - - for (int i = job_range.begin; i < job_range.end; i++){ - for (int j = 0; j < (*row_nnz_ptr); j++){ - 
*(vCj_cursor++) = c->index; - *(vCx_cursor++) = (c++)->value; - } - Cp_i += *(row_nnz_ptr++); - Cp[i + 1] = Cp_i; - } -} - -void inner_sparse_dot_topn( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_nnz, - int* total -) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(job_range.end - job_range.begin); - - row_nnz->resize(job_range.end - job_range.begin); - int* row_nnz_ptr = row_nnz->data(); - - for (int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + 
ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_nnz_ptr++) = len; - (*total) += len; - } -} - -void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregate: - std::vector sub_total(n_jobs, 0); - - std::vector thread_list(n_jobs); - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn, - job_ranges[job_nr], - n_col, ntop, - lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results: - std::vector nnz_job_starts(n_jobs + 1); - nnz_job_starts[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_v1, - job_ranges[job_nr], - Cp, - nnz_job_starts[job_nr], - Cj, - Cx, - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); -} - -void inner_sparse_dot_topn_extd( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - std::vector* real_candidates, - std::vector* row_nnz, - int* total, - int* n_minmax, - int mem_sz_per_row 
-) -{ - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - real_candidates->reserve(mem_sz_per_row*(job_range.end - job_range.begin)); - - row_nnz->resize(job_range.end - job_range.begin); - int* row_nnz_ptr = row_nnz->data(); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - size_t sz = real_candidates->size(); - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i+1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - real_candidates->push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int) (real_candidates->size() - sz); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - - candidate* candidate_arr_begin = real_candidates->data() + sz; - if (len > ntop_inner){ - std::partial_sort( - candidate_arr_begin, - candidate_arr_begin + ntop_inner, - candidate_arr_begin + len, - candidate_cmp - ); - len = ntop_inner; - } - else { - std::sort( - candidate_arr_begin, - candidate_arr_begin + len, - candidate_cmp - ); - } - - real_candidates->resize(sz + (size_t) len); - *(row_nnz_ptr++) = len; - (*total) += len; - } -} - -int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int *n_minmax, - int n_jobs -) -{ - std::vector job_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_ranges); - - std::vector> real_candidates(n_jobs); - std::vector> row_nnz(n_jobs); - - // initialize aggregates: - std::vector sub_total(n_jobs, 0); - std::vector split_n_minmax(n_jobs, 0); - - int mem_sz_per_row = std::max(1, (int) std::ceil(((double) nnz_max)/((double) n_row))); - - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_sparse_dot_topn_extd, - job_ranges[job_nr], - n_col, ntop, - lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &real_candidates[job_nr], - &row_nnz[job_nr], - &sub_total[job_nr], - &split_n_minmax[job_nr], - mem_sz_per_row - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - // gather the results: - *n_minmax = *max_element(split_n_minmax.begin(), split_n_minmax.end()); - - std::vector nnz_job_starts(n_jobs + 1); - nnz_job_starts[0] = 0; - partial_sum(sub_total.begin(), sub_total.end(), nnz_job_starts.begin() + 1); - - int* Cj_container; - double* Cx_container; - - int total = nnz_job_starts.back(); - int nnz_max_is_too_small = (nnz_max < total); - - if 
(nnz_max_is_too_small) { - alt_Cj->resize(total); - alt_Cx->resize(total); - Cj_container = &((*alt_Cj)[0]); - Cx_container = &((*alt_Cx)[0]); - } - else { - Cj_container = Cj; - Cx_container = Cx; - } - - Cp[0] = 0; - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread( - inner_gather_v1, - job_ranges[job_nr], - Cp, - nnz_job_starts[job_nr], - Cj_container, - Cx_container, - &real_candidates[job_nr], - &row_nnz[job_nr] - ); - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - return nnz_max_is_too_small; -} - -void inner_sparse_nnz_only( - job_range_type job_range, - int n_col_inner, - int ntop_inner, - double lower_bound_inner, - int Ap_copy[], - int Aj_copy[], - double Ax_copy[], - int Bp_copy[], - int Bj_copy[], - double Bx_copy[], - int* nnz -) -{ - - std::vector next(n_col_inner,-1); - std::vector sums(n_col_inner, 0); - - for(int i = job_range.begin; i < job_range.end; i++){ - - int head = -2; - int length = 0; - int candidates_sz = 0; - - int jj_start = Ap_copy[i]; - int jj_end = Ap_copy[i + 1]; - - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj_copy[jj]; - double v = Ax_copy[jj]; //value of A in (i,j) - - int kk_start = Bp_copy[j]; - int kk_end = Bp_copy[j + 1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj_copy[kk]; //kth column of B in row j - - sums[k] += v*Bx_copy[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound_inner) candidates_sz++; - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - if (candidates_sz > ntop_inner) candidates_sz = ntop_inner; - - (*nnz) += 
candidates_sz; - } -} - -int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs -) -{ - std::vector job_row_ranges(n_jobs); - distribute_load(n_row, n_jobs, job_row_ranges); - - std::vector split_nnz(n_jobs, 0); - std::vector thread_list(n_jobs); - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) { - - thread_list[job_nr] = std::thread ( - inner_sparse_nnz_only, - job_row_ranges[job_nr], - n_col, - ntop, lower_bound, - Ap, Aj, Ax, Bp, Bj, Bx, - &split_nnz[job_nr] - ); - - } - - for (int job_nr = 0; job_nr < n_jobs; job_nr++) - thread_list[job_nr].join(); - - return std::accumulate(split_nnz.begin(), split_nnz.end(), (int) 0); -} - diff --git a/string_grouper_topn/sparse_dot_topn_parallel.h b/string_grouper_topn/sparse_dot_topn_parallel.h deleted file mode 100644 index 3aeb11e0..00000000 --- a/string_grouper_topn/sparse_dot_topn_parallel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#ifndef UTILS_CPPCLASS_H -#define UTILS_CPPCLASS_H - -extern void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs -); - -extern int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax, - int n_jobs -); - -extern int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs -); - -#endif //UTILS_CPPCLASS_H diff --git a/string_grouper_topn/sparse_dot_topn_source.cpp b/string_grouper_topn/sparse_dot_topn_source.cpp deleted file mode 100644 index be987495..00000000 --- a/string_grouper_topn/sparse_dot_topn_source.cpp +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#include -#include -#include - -#include "./sparse_dot_topn_source.h" - -bool candidate_cmp(candidate c_i, candidate c_j) { return (c_i.value > c_j.value); } - -/* - C++ implementation of sparse_dot_topn - - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B] - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - - N.B. A and B must be CSR format!!! -*/ -void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - - int nnz = 0; - - Cp[0] = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = 
sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - - for(int a=0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - candidates.clear(); - - Cp[i+1] = nnz; - } -} - -/* - C++ implementation of sparse_dot_topn_extd_source - - This function will return a matrix C in CSR format, where - C = [sorted top n results > lower_bound for each row of A * B]. - The maximum number n_minmax of elements per row of C (assuming ntop = n_col) - is also returned. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: n top results - lower_bound: a threshold that the element of A*B must greater than - nnz_max: the size of the memory allocated for the results Cj and Cx. If - nnz_max is found to be too small during the computation, then the - results will be placed in vectors alt_Cj and alt_Cx instead - - Output by reference: - Cp, Cj, Cx: CSR expression of C matrix - n_minmax: The maximum number of elements per row of C (assuming ntop = n_col) - alt_Cj, alt_Cx: CSR expression of C matrix as vectors. These will - contain the output only if nnz_max is found to be too small - - Returned output: - nnz_max_is_too_small: int 1 or 0 depending on whether nnz_max was found to be - too small or not respectively - N.B. A and B must be CSR format!!! 
-*/ -int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - std::vector candidates; - candidates.reserve(n_col); - - int nnz = 0; - int nnz_max_is_too_small = 0; - - Cp[0] = 0; - *n_minmax = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound){ //append the nonzero elements - candidate c; - c.index = head; - c.value = sums[head]; - candidates.push_back(c); - } - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - int len = (int)candidates.size(); - *n_minmax = (len > *n_minmax)? 
len : *n_minmax; - if (len > ntop){ - std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp); - len = ntop; - } else { - std::sort(candidates.begin(), candidates.end(), candidate_cmp); - } - if (len + nnz > nnz_max){ - if (!nnz_max_is_too_small){ - nnz_max_is_too_small = true; - alt_Cj->resize(nnz); - alt_Cx->resize(nnz); - std::copy(Cj, Cj + nnz, alt_Cj->data()); - std::copy(Cx, Cx + nnz, alt_Cx->data()); - } - for(int a = 0; a < len; a++){ - alt_Cj->push_back(candidates[a].index); - alt_Cx->push_back(candidates[a].value); - nnz++; - } - } - else { - for(int a = 0; a < len; a++){ - Cj[nnz] = candidates[a].index; - Cx[nnz] = candidates[a].value; - nnz++; - } - } - candidates.clear(); - - Cp[i+1] = nnz; - } - return nnz_max_is_too_small; -} - -/* - C++ implementation of sparse_dot_nnz_source - - This function will return the number nnz of nonzero elements - of the matrix C in CSR format, where - C = [all results > lower_bound sorted for each row of A * B] - and ntop the maximum number of elements per row of C. - This function is designed primarily to help with memory management for - very large sparse matrices. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - lower_bound: a threshold that the element of A*B must greater than - - Output: - nnz: number of nonzero elements of matrix C - ntop: maximum number of elements per row of C - - N.B. A and B must be CSR format!!! 
-*/ -void sparse_dot_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - double lower_bound, - int* nnz, - int* ntop -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - *nnz = 0; - *ntop = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - int nnz_k = 0; - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) nnz_k++; //count this nonzero element in - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - *ntop = (nnz_k > *ntop)? nnz_k : *ntop; - *nnz += nnz_k; - } -} - -/* - C++ implementation of sparse_dot_only_max_nnz_col_source - - This function will return nnz, the total number of nonzero - matrix-components of - C = [top n results > lower_bound for each row of A * B]. - - Input: - n_row: number of rows of A matrix - n_col: number of columns of B matrix - - Ap, Aj, Ax: CSR expression of A matrix - Bp, Bj, Bx: CSR expression of B matrix - - ntop: top n results - lower_bound: a threshold that the element of A*B must greater than - - Returned output: - nnz: the total number of nonzero matrix-components of C - - N.B. A and B must be CSR format!!! 
-*/ -int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound -) -{ - std::vector next(n_col,-1); - std::vector sums(n_col, 0); - - int nnz = 0; - - for(int i = 0; i < n_row; i++){ - int head = -2; - int length = 0; - int candidates_sz = 0; - - int jj_start = Ap[i]; - int jj_end = Ap[i+1]; - for(int jj = jj_start; jj < jj_end; jj++){ - int j = Aj[jj]; - double v = Ax[jj]; //value of A in (i,j) - - int kk_start = Bp[j]; - int kk_end = Bp[j+1]; - for(int kk = kk_start; kk < kk_end; kk++){ - int k = Bj[kk]; //kth column of B in row j - - sums[k] += v*Bx[kk]; //multiply with value of B in (j,k) and accumulate to the result for kth column of row i - - if(next[k] == -1){ - next[k] = head; //keep a linked list, every element points to the next column index - head = k; - length++; - } - } - } - - for(int jj = 0; jj < length; jj++){ //length = number of columns set (may include 0s) - - if(sums[head] > lower_bound) candidates_sz++; - - int temp = head; - head = next[head]; //iterate over columns - - next[temp] = -1; //clear arrays - sums[temp] = 0; //clear arrays - } - - if (candidates_sz > ntop) candidates_sz = ntop; - - nnz += candidates_sz; - } - return nnz; -} diff --git a/string_grouper_topn/sparse_dot_topn_source.h b/string_grouper_topn/sparse_dot_topn_source.h deleted file mode 100644 index 0ac85127..00000000 --- a/string_grouper_topn/sparse_dot_topn_source.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Zhe Sun, Ahmet Erdem -// April 20, 2017 -// Modified by: Particular Miner -// April 14, 2021 - -#ifndef UTILS_CPPCLASS_H -#define UTILS_CPPCLASS_H - - -struct candidate {int index; double value;}; - -extern bool candidate_cmp(candidate c_i, candidate c_j); - -extern void sparse_dot_topn_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[] //data of C -); - -extern int sparse_dot_topn_extd_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], //data of C - std::vector* alt_Cj, - std::vector* alt_Cx, - int nnz_max, - int* n_minmax -); - -extern int sparse_dot_only_nnz_source( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], //data of A - int Bp[], - int Bj[], - double Bx[], //data of B - int ntop, - double lower_bound -); - -#endif //UTILS_CPPCLASS_H diff --git a/string_grouper_topn/sparse_dot_topn_threaded.pyx b/string_grouper_topn/sparse_dot_topn_threaded.pyx deleted file mode 100644 index e20aaaaf..00000000 --- a/string_grouper_topn/sparse_dot_topn_threaded.pyx +++ /dev/null @@ -1,190 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at# -# http://www.apache.org/licenses/LICENSE-2.0# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Zhe Sun, Ahmet Erdem -# April 20, 2017 -# Modified by: Particular Miner -# April 14, 2021 - -# distutils: language = c++ - -from libcpp.vector cimport vector -from array_wrappers cimport ArrayWrapper_int, ArrayWrapper_double - -cimport numpy as np -import numpy as np - - -np.import_array() - - -cdef extern from "sparse_dot_topn_parallel.h": - - cdef void sparse_dot_topn_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - int n_jobs - ); - - cdef int sparse_dot_topn_extd_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int topn, - double lower_bound, - int Cp[], - int Cj[], - double Cx[], - vector[int]* alt_Cj, - vector[double]* alt_Cx, - int nnz_max, - int* n_minmax, - int n_jobs - ); - - cdef int sparse_dot_only_nnz_parallel( - int n_row, - int n_col, - int Ap[], - int Aj[], - double Ax[], - int Bp[], - int Bj[], - double Bx[], - int ntop, - double lower_bound, - int n_jobs - ); - -cpdef sparse_dot_topn_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - 
np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - - sparse_dot_topn_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, n_jobs - ) - return - -cpdef sparse_dot_topn_extd_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - np.ndarray[int, ndim=1] c_indptr, - np.ndarray[int, ndim=1] c_indices, - np.ndarray[double, ndim=1] c_data, - np.ndarray[int, ndim=1] nminmax, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - cdef int* Cp = &c_indptr[0] - cdef int* Cj = &c_indices[0] - cdef double* Cx = &c_data[0] - cdef int* n_minmax = &nminmax[0] - - cdef nnz_max = len(c_indices) - - cdef vector[int] vCj; - cdef vector[double] vCx; - - cdef int nnz_max_is_too_small = sparse_dot_topn_extd_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, Cj, Cx, &vCj, &vCx, nnz_max, n_minmax, n_jobs - ) - - if nnz_max_is_too_small: - - c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0) - c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0) - - return c_indices, c_data - - else: - - return None, None - -cpdef sparse_dot_only_nnz_threaded( - int n_row, - int n_col, - np.ndarray[int, ndim=1] a_indptr, - np.ndarray[int, ndim=1] a_indices, - np.ndarray[double, ndim=1] a_data, - np.ndarray[int, 
ndim=1] b_indptr, - np.ndarray[int, ndim=1] b_indices, - np.ndarray[double, ndim=1] b_data, - int ntop, - double lower_bound, - int n_jobs -): - - cdef int* Ap = &a_indptr[0] - cdef int* Aj = &a_indices[0] - cdef double* Ax = &a_data[0] - cdef int* Bp = &b_indptr[0] - cdef int* Bj = &b_indices[0] - cdef double* Bx = &b_data[0] - - return sparse_dot_only_nnz_parallel( - n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, n_jobs - ) diff --git a/string_grouper_topn/test/test_awesome_cossim_topn.py b/string_grouper_topn/test/test_awesome_cossim_topn.py deleted file mode 100644 index ffb17915..00000000 --- a/string_grouper_topn/test/test_awesome_cossim_topn.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- - -from string_grouper_topn import awesome_cossim_topn -from scipy.sparse.csr import csr_matrix -from scipy.sparse import coo_matrix -from scipy.sparse import rand -import numpy as np -import pandas as pd -import multiprocessing -import pytest - -PRUNE_THRESHOLD = 0.1 -NUM_CANDIDATES = 3 -USE_THREADS = True -MAX_N_PROCESSES = min(8, multiprocessing.cpu_count()) - 1 - - -def get_n_top_sparse(mat, n_top=10): - """ - Get list of (index, value) of the n largest elements in a 1-dimensional sparse matrix - - :param mat: input sparse matrix - :param n_top: number of largest elements, default is 10. 
- :return: sorted list of largest elements - """ - length = mat.getnnz() - if length == 0: - return None - if length <= n_top: - result = list(zip(mat.indices, mat.data)) - else: - arg_idx = np.argpartition(mat.data, -n_top)[-n_top:] - result = list(zip(mat.indices[arg_idx], mat.data[arg_idx])) - return sorted(result, key=lambda x: -x[1]) - - -def helper_awesome_cossim_topn_dense( - a_dense, - b_dense, - use_threads=False, - n_jobs=1 - ): - dense_result = np.dot(a_dense, np.transpose(b_dense)) # dot product - sparse_result = csr_matrix(dense_result) - sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) - for row in sparse_result] # get ntop using the old method - - pruned_dense_result = dense_result.copy() - pruned_dense_result[pruned_dense_result < PRUNE_THRESHOLD] = 0 # prune low similarity - pruned_sparse_result = csr_matrix(pruned_dense_result) - pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] - - a_csr = csr_matrix(a_dense) - b_csr_t = csr_matrix(b_dense).T - - awesome_result = awesome_cossim_topn( - a_csr, b_csr_t, len(b_dense), - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed - - pruned_awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - len(b_dense), - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in pruned_awesome_result_top3] - - # no candidate selection, no pruning - assert awesome_result.nnz == sparse_result.nnz 
- # no candidate selection, below PRUNE_THRESHOLD similarity pruned - assert pruned_awesome_result.nnz == pruned_sparse_result.nnz - - all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) - all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) - - # top NUM_CANDIDATES candidates selected, no pruning - if not all_none1: - np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) - else: - assert len(awesome_result_top3) == len(sparse_result_top3) - # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned - if not all_none2: - np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) - else: - assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) - - -def helper_awesome_cossim_topn_sparse( - a_sparse, - b_sparse, - flag=True, - use_threads=False, - n_jobs=1 - ): - # Note: helper function using awesome_cossim_topn - sparse_result = a_sparse.dot(b_sparse.T) # dot product - sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) - for row in sparse_result] # get ntop using the old method - - pruned_sparse_result = sparse_result.copy() - pruned_sparse_result[pruned_sparse_result < PRUNE_THRESHOLD] = 0 # prune low similarity - pruned_sparse_result.eliminate_zeros() - pruned_sparse_result_top3 = [get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result] - - a_csr = csr_matrix(a_sparse) - b_csr_t = csr_matrix(b_sparse).T - - awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - b_sparse.shape[0], - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - 0.0, - use_threads=use_threads, - n_jobs=n_jobs - ) - awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed - - 
pruned_awesome_result = awesome_cossim_topn( - a_csr, - b_csr_t, - b_sparse.shape[0], - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = awesome_cossim_topn( - a_csr, - b_csr_t, - NUM_CANDIDATES, - PRUNE_THRESHOLD, - use_threads=use_threads, - n_jobs=n_jobs - ) - pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len( - row.data) > 0 else None for row in pruned_awesome_result_top3] - - # no candidate selection, no pruning - assert awesome_result.nnz == sparse_result.nnz - # no candidate selection, below PRUNE_THRESHOLD similarity pruned - assert pruned_awesome_result.nnz == pruned_sparse_result.nnz - - if flag: - all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(pd.isnull(sparse_result_top3)) - all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(pd.isnull(pruned_sparse_result_top3)) - - # top NUM_CANDIDATES candidates selected, no pruning - if not all_none1: - np.testing.assert_array_almost_equal(awesome_result_top3, sparse_result_top3) - else: - assert len(awesome_result_top3) == len(sparse_result_top3) - # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned - if not all_none2: - np.testing.assert_array_almost_equal(pruned_awesome_result_top3, pruned_sparse_result_top3) - else: - assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3) - else: - assert awesome_result_top3 == sparse_result_top3 - assert pruned_awesome_result_top3 == pruned_sparse_result_top3 - - -def test_awesome_cossim_topn_manually(): - # a simple case - a_dense = [[0.2, 0.1, 0.0, 0.9, 0.3], - [0.7, 0.0, 0.0, 0.2, 0.2], - [0.0, 0.0, 0.0, 0.2, 0.1], - [0.5, 0.4, 0.5, 0.0, 0.0]] - - b_dense = [[0.4, 0.2, 0.3, 0.2, 0.7], - [0.9, 0.4, 0.5, 0.1, 0.4], - [0.3, 0.8, 0.0, 0.2, 0.5], - [0.3, 0.0, 0.1, 0.1, 0.6], - [0.6, 0.1, 0.2, 0.8, 0.1], - [0.9, 0.1, 0.6, 0.4, 0.3]] - helper_awesome_cossim_topn_dense(a_dense, b_dense) - for process in range(MAX_N_PROCESSES): - n_jobs = 
process + 1 - helper_awesome_cossim_topn_dense(a_dense, b_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - - # boundary checking, there is no matching at all in this case - c_dense = [[0.2, 0.1, 0.3, 0, 0], - [0.7, 0.2, 0.7, 0, 0], - [0.3, 0.9, 0.6, 0, 0], - [0.5, 0.4, 0.5, 0, 0]] - d_dense = [[0, 0, 0, 0.6, 0.9], - [0, 0, 0, 0.1, 0.1], - [0, 0, 0, 0.2, 0.6], - [0, 0, 0, 0.8, 0.4], - [0, 0, 0, 0.1, 0.3], - [0, 0, 0, 0.7, 0.5]] - helper_awesome_cossim_topn_dense(c_dense, d_dense) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_dense(c_dense, d_dense, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_one_zeros(): - # test with one row matrix with all zeros - # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top - nr_vocab = 1000 - density = 0.1 - for _ in range(3): - a_sparse = csr_matrix(np.zeros((1, nr_vocab))) - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_all_zeros(): - # test with all zeros matrix - # helper_awesome_cossim_top_sparse uses a local function awesome_cossim_top - nr_vocab = 1000 - density = 0.1 - for _ in range(3): - a_sparse = csr_matrix(np.zeros((2, nr_vocab))) - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse) - for process in 
range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_small_matrix(): - # test with small matrix - nr_vocab = 1000 - density = 0.1 - for _ in range(10): - a_sparse = rand(300, nr_vocab, density=density, format='csr') - b_sparse = rand(800, nr_vocab, density=density, format='csr') - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) - - -@pytest.mark.filterwarnings("ignore:Comparing a sparse matrix with a scalar greater than zero") -@pytest.mark.filterwarnings("ignore:Changing the sparsity structure of a csr_matrix is expensive") -def test_awesome_cossim_top_large_matrix(): - # MB: I reduced the size of the matrix so the test also runs in small memory. 
- # test with large matrix - nr_vocab = 2 << 24 - density = 1e-6 - n_samples = 1000 - nnz = int(n_samples * nr_vocab * density) - - rng1 = np.random.RandomState(42) - rng2 = np.random.RandomState(43) - - for _ in range(1): - # scipy.sparse.rand has very high memory usage - # see for details: https://github.com/scipy/scipy/issues/9699 - # a_sparse = rand(500, nr_vocab, density=density, format='csr') - # b_sparse = rand(80000, nr_vocab, density=density, format='csr') - - # switching to alternative random method below, which is also a lot faster - row = rng1.randint(500, size=nnz) - cols = rng2.randint(nr_vocab, size=nnz) - data = rng1.rand(nnz) - - a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - a_sparse = a_sparse.tocsr() - - row = rng1.randint(n_samples, size=nnz) - cols = rng2.randint(nr_vocab, size=nnz) - data = rng1.rand(nnz) - - b_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab)) - b_sparse = b_sparse.tocsr() - - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False) - for process in range(MAX_N_PROCESSES): - n_jobs = process + 1 - helper_awesome_cossim_topn_sparse(a_sparse, b_sparse, False, use_threads=USE_THREADS, n_jobs=n_jobs) From bce1ce7548176f191b02a55f7bdb81256e051810 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 16:01:45 +0200 Subject: [PATCH 25/29] restored dependency on upgraded package sparse_dot_topn --- string_grouper/string_grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 7ebdaa82..7ef70c94 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -7,7 +7,7 @@ from scipy.sparse.lil import lil_matrix from scipy.sparse.csgraph import connected_components from typing import Tuple, NamedTuple, List, Optional, Union -from string_grouper_topn import awesome_cossim_topn +from sparse_dot_topn 
import awesome_cossim_topn from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 From 3fd7329df9bddc0826491712450518f7251e67cd Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 16:10:10 +0200 Subject: [PATCH 26/29] updated GitHub workflow action test script --- .github/workflows/test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5317a62d..d69ee3c5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,11 +21,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install dev-package - run: | - python -m pip install --upgrade pip - pip install -v -e . - - name: Run tests run: | pip install pytest From 7742fe4730e314c494e347934754e7fefdb8241c Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Thu, 10 Jun 2021 22:09:13 +0200 Subject: [PATCH 27/29] updated dependency on latest version of sparse_dot_topn (v 0.3.1) --- .github/workflows/test.yml | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d69ee3c5..5317a62d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,6 +21,11 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Install dev-package + run: | + python -m pip install --upgrade pip + pip install -v -e . 
+ - name: Run tests run: | pip install pytest diff --git a/setup.py b/setup.py index f4b5ecb0..4b7dc00a 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,6 @@ , 'scipy' , 'scikit-learn' , 'numpy' - , 'sparse_dot_topn>=0.2.6' + , 'sparse_dot_topn>=0.3.1' ] ) From 36f731635ec17dea0dc2435d13151779b13fa606 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Fri, 11 Jun 2021 08:28:46 +0200 Subject: [PATCH 28/29] updated CHANGELOG.md --- .github/workflows/test.yml | 4 +--- CHANGELOG.md | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5317a62d..b29917e6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,4 @@ jobs: pip install -v -e . - name: Run tests - run: | - pip install pytest - pytest -ra --capture=no --showlocals + run: python -m unittest diff --git a/CHANGELOG.md b/CHANGELOG.md index d1cb63ff..9ec3c325 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.1?] - 2021-06-11 + +### Added +[No additions were made] + +### Changed + +* Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1 +* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if +`duplicates` is not given). +* Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. 
+ +### Removed + +* Removed the keyword argument `suppress_warning` + ## [0.4.0] - 2021-04-11 ### Added From 64e4f8597fe3d508e90a4fcecba0228bf8b224f3 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Fri, 11 Jun 2021 13:40:44 +0200 Subject: [PATCH 29/29] added new keyword argument tfidf_matrix_dtype (the datatype for the tf-idf values of the matrix components). Allowed values are np.float32 and np.float64 (used by sparse_dot_topn v0.3.1). Default is np.float32: np.float32 often leads to faster processing but less precision than np.float64 --- CHANGELOG.md | 7 ++++--- README.md | 1 + string_grouper/string_grouper.py | 16 +++++++++++++++- string_grouper/test/test_string_grouper.py | 7 +++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ec3c325..ed25a1c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.4.1?] - 2021-06-11 ### Added -[No additions were made] + +* Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `numpy.float32` and `numpy.float64` (used by the required external package `sparse_dot_topn` version 0.3.1). Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) ### Changed * Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1 -* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if -`duplicates` is not given). +* Changed the default datatype for cosine similarities from numpy.float64 to numpy.float32 to boost computational performance at the expense of numerical precision. 
+* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given). * Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. ### Removed diff --git a/README.md b/README.md index 6d391ead..1b18c3c9 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used: * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. + * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. 
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 7ef70c94..d1612511 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -11,6 +11,7 @@ from functools import wraps DEFAULT_NGRAM_SIZE: int = 3 +DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) DEFAULT_REGEX: str = r'[,-./]|\s' DEFAULT_MAX_N_MATCHES: int = 20 DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match @@ -140,6 +141,10 @@ class StringGrouperConfig(NamedTuple): Class with configuration variables. :param ngram_size: int. The amount of characters in each n-gram. Default is 3. + :param tfidf_matrix_dtype: type. The datatype for the tf-idf values of the matrix components. + Possible values allowed by sparse_dot_topn are np.float32 and np.float64. Default is np.float32. + (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision + than np.float64.) :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. 
@@ -157,6 +162,7 @@ class StringGrouperConfig(NamedTuple): """ ngram_size: int = DEFAULT_NGRAM_SIZE + tfidf_matrix_dtype: int = DEFAULT_TFIDF_MATRIX_DTYPE regex: str = DEFAULT_REGEX max_n_matches: Optional[int] = None min_similarity: float = DEFAULT_MIN_SIMILARITY @@ -227,9 +233,10 @@ def __init__(self, master: pd.Series, self._max_n_matches = self._config.max_n_matches self._validate_group_rep_specs() + self._validate_tfidf_matrix_dtype() self._validate_replace_na_and_drop() self.is_build = False # indicates if the grouper was fit or not - self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams) + self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype) # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches self._matches_list: pd.DataFrame = pd.DataFrame() # _true_max_n_matches will contain the true maximum number of matches over all strings in master if @@ -622,6 +629,13 @@ def _validate_group_rep_specs(self): f"Invalid option value for group_rep. The only permitted values are\n {group_rep_options}" ) + def _validate_tfidf_matrix_dtype(self): + dtype_options = (np.float32, np.float64) + if self._config.tfidf_matrix_dtype not in dtype_options: + raise Exception( + f"Invalid option value for tfidf_matrix_dtype. 
The only permitted values are\n {dtype_options}" + ) + def _validate_replace_na_and_drop(self): if self._config.ignore_index and self._config.replace_na: raise Exception("replace_na can only be set to True when ignore_index=False.") diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 2438d679..f5f0aac8 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -133,6 +133,7 @@ def test_compute_pairwise_similarities(self): ], name='similarity' ) + expected_result = expected_result.astype(np.float32) pd.testing.assert_series_equal(expected_result, similarities) def test_compute_pairwise_similarities_data_integrity(self): @@ -367,6 +368,7 @@ def test_build_matches_list(self): dupe_side = [0, 1] similarity = [1.0, 1.0] expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_case_insensitive_build_matches_list(self): @@ -379,6 +381,7 @@ def test_case_insensitive_build_matches_list(self): dupe_side = [0, 1] similarity = [1.0, 1.0] expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg._matches_list) def test_get_matches_two_dataframes(self): @@ -393,6 +396,7 @@ def test_get_matches_two_dataframes(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def 
test_get_matches_single(self): @@ -407,6 +411,7 @@ def test_get_matches_single(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_1_series_1_id_series(self): @@ -424,6 +429,7 @@ def test_get_matches_1_series_1_id_series(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_2_series_2_id_series(self): @@ -443,6 +449,7 @@ def test_get_matches_2_series_2_id_series(self): expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, 'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index}) + expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype) pd.testing.assert_frame_equal(expected_df, sg.get_matches()) def test_get_matches_raises_exception_if_unexpected_options_given(self):