diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..1c96eda --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +Language: Cpp +BasedOnStyle: Google +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +PenaltyBreakComment: 100 diff --git a/.gitignore b/.gitignore index 3da6175..43bc1b8 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,7 @@ nosetests.xml # Project specific tsne/bh_sne.cpp -tsne/bh_sne_3d.cpp tsne/bh_sne_src/bh_tsne *.pkl.gz .cache +.pytest_cache/ diff --git a/.travis.yml b/.travis.yml index 3c978da..fda95df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,37 +1,22 @@ language: python python: - - "2.7" - # - "3.4" +- "2.7" +compiler: clang +env: +- CC="clang" CXX="clang++" LDFLAGS="-fuse-ld=lld" +dist: xenial addons: apt: packages: - build-essential - - libatlas-base-dev - -sudo: false + - liblapack3 install: - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - fi - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION - - source activate test-environment - - conda install numpy cython scipy pytest scikit-learn - - python setup.py install - - source deactivate - + - pip install numpy==1.14.2 cython==0.28.2 pytest==3.5.1 scikit-learn==0.19.1 scipy==1.0.1 script: - pwd - - source activate test-environment + - python setup.py install - cd tsne/tests - py.test -s -vv diff --git a/Makefile b/Makefile index be317d5..a710977 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,39 @@ -build: - python setup.py build_ext --inplace +.PHONY: build -install: - python setup.py build_ext --inplace - python setup.py install +build: setup.py tsne/bh_sne.pyx \ + $(wildcard tsne/bh_sne_src/*.h) \ + $(wildcard tsne/bh_sne_src/*.cpp) + python $< build_ext --inplace -sdist: - python setup.py sdist +PROFILE_LOCATION=$(shell pwd)/profile + +$(PROFILE_LOCATION)/bh_sne.profdata: PROFILE=generate +$(PROFILE_LOCATION)/bh_sne.profdata: setup.py Makefile tsne/bh_sne.pyx \ + $(wildcard tsne/bh_sne_src/*.h) \ + $(wildcard tsne/bh_sne_src/*.cpp) + rm -rf build tsne/*.so *.so $(PROFILE_LOCATION)/*.profraw + PROFILE=generate python $< build_ext --inplace + mkdir -p $(PROFILE_LOCATION) + cd tsne/tests && LLVM_PROFILE_FILE="$(PROFILE_LOCATION)/%p-%m.profraw" py.test + llvm-profdata merge -output=$@ $(PROFILE_LOCATION)/*.profraw + rm -rf tsne/*.so *.so build tsne/*.o + +build_pgo: setup.py $(PROFILE_LOCATION)/bh_sne.profdata + PROFILE="$(PROFILE_LOCATION)/bh_sne.profdata" python $< build_ext --inplace + +install: setup.py build + python $< install + +sdist: setup.py tsne/bh_sne.pyx \ + $(wildcard tsne/bh_sne_src/*.h) \ + $(wildcard tsne/bh_sne_src/*.cpp) + python $< sdist + +test: build + cd tsne/tests && time py.test -s -vv clean: - rm -rf *.pyc *.so build/ bh_sne.cpp - rm -rf tsne/*.pyc tsne/*.so tsne/build/ tsne/bh_sne.cpp + make -C tsne/bh_sne_src clean + rm -rf *.pyc *.so build/ profile/ bh_sne.cpp + rm -rf tsne/*.pyc tsne/*.so tsne/*.o tsne/build/ tsne/bh_sne.cpp + rm -rf .pytest_cache/ diff --git a/README.md b/README.md index c0d577e..169dad0 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ Requirements * [numpy](numpy.scipy.org) > =1.7.1 * [scipy](http://www.scipy.org/) >= 0.12.0 * [cython](cython.org) >= 0.19.1 -* [cblas](http://www.netlib.org/blas/) or [openblas](https://github.com/xianyi/OpenBLAS). Tested version is v0.2.5 and v0.2.6 (not necessary for OSX). [Anaconda](http://continuum.io/downloads) is recommended. diff --git a/setup.py b/setup.py index 38cd82b..709dbb0 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ """ import sys +import os import platform from distutils.core import setup @@ -28,35 +29,45 @@ if v2 >= 10: # More than 10.10 - extra_compile_args=['-I/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers'] + extra_compile_args = [ + '-I/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers'] else: - extra_compile_args=['-I/System/Library/Frameworks/vecLib.framework/Headers'] + extra_compile_args = [ + '-I/System/Library/Frameworks/vecLib.framework/Headers'] ext_modules = [Extension(name='bh_sne', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], - include_dirs=[numpy.get_include(), 'tsne/bh_sne_src/'], - extra_compile_args=extra_compile_args + ['-ffast-math', '-O3'], - extra_link_args=['-Wl,-framework', '-Wl,Accelerate', '-lcblas'], - language='c++')] + sources=['tsne/bh_sne_src/sptree.cpp', + 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], + include_dirs=[ + numpy.get_include(), 'tsne/bh_sne_src/'], + extra_compile_args=extra_compile_args + + ['-ffast-math', '-O3', '-std=c++14'], + extra_link_args=['-Wl,-framework', + '-Wl,Accelerate'], + language='c++')] else: # LINUX - + opt_flags = ['-msse3', '-O3', '-flto'] + if 'PROFILE' in os.environ: + if os.environ['PROFILE'] == 'generate': + opt_flags.append('-fprofile-instr-generate') + elif os.path.isfile(os.environ['PROFILE']): + opt_flags.append('-fprofile-instr-use='+os.environ['PROFILE']) + ldflags = list(opt_flags) + if 'LDFLAGS' in os.environ and os.environ['LDFLAGS']: + ldflags.extend(os.environ['LDFLAGS'].split(' ')) + ldflags.extend(['-Wl,-Bdynamic,--as-needed', '-lgcc_s']) ext_modules = [Extension(name='bh_sne', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], - include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], - library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], - extra_compile_args=['-msse3', '-O3', '-fPIC', '-w', '-ffast-math'], - extra_link_args=['-Wl,-Bdynamic,--as-needed', '-lgcc_s'], - language='c++'), - - Extension(name='bh_sne_3d', - sources=['tsne/bh_sne_src/sptree.cpp', 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne_3d.pyx'], - include_dirs=[numpy.get_include(), '/usr/local/include', 'tsne/bh_sne_src/'], - library_dirs=['/usr/local/lib', '/usr/lib64/atlas'], - extra_compile_args=['-msse3', '-O3', '-fPIC', '-w', '-ffast-math', '-DTSNE3D'], - extra_link_args=['-Wl,-Bdynamic,--as-needed', '-lgcc_s'], - language='c++')] + sources=['tsne/bh_sne_src/sptree.cpp', + 'tsne/bh_sne_src/tsne.cpp', 'tsne/bh_sne.pyx'], + include_dirs=[ + numpy.get_include(), 'tsne/bh_sne_src/'], + extra_compile_args=opt_flags + + ['-Wall', '-fPIC', '-std=c++14', '-w'], + extra_link_args=ldflags, + language='c++'), + ] ext_modules = cythonize(ext_modules) @@ -77,4 +88,4 @@ packages=find_packages(), ext_modules=ext_modules, install_requires=required -) + ) diff --git a/tsne/__init__.py b/tsne/__init__.py index c5b9d3c..a6fcb2b 100644 --- a/tsne/__init__.py +++ b/tsne/__init__.py @@ -1,10 +1,11 @@ # encoding: utf-8 from __future__ import division + import numpy as np import scipy.linalg as la import sys from bh_sne import BH_SNE -from bh_sne_3d import BH_SNE_3D + def bh_sne(data, pca_d=None, d=2, perplexity=30., theta=0.5, random_state=None, copy_data=False, init=None, @@ -79,17 +80,16 @@ def bh_sne(data, pca_d=None, d=2, perplexity=30., theta=0.5, if mom_switch_iter is None: mom_switch_iter = 250 - if d == 2: - tsne = BH_SNE() - elif d == 3: - tsne = BH_SNE_3D() - else: + if d != 2 and d != 3: raise Exception("TSNE dimensions must be 2 or 3") - Y = tsne.run(X, N, X.shape[1], d, perplexity, theta, seed, init=init, use_init=use_init, - max_iter=max_iter, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter) + Y = BH_SNE.run(X, d, + perplexity, theta, + seed, init=init, use_init=use_init, + max_iter=max_iter, stop_lying_iter=stop_lying_iter, mom_switch_iter=mom_switch_iter) return Y + from ._version import get_versions __version__ = get_versions()['version'] del get_versions diff --git a/tsne/bh_sne.pyx b/tsne/bh_sne.pyx index e45dd13..6dada3d 100644 --- a/tsne/bh_sne.pyx +++ b/tsne/bh_sne.pyx @@ -5,24 +5,33 @@ cimport cython from libcpp cimport bool cdef extern from "tsne.h": - cdef cppclass TSNE: - TSNE() - void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed, bool skip_random_init, double *init, bool use_init, int max_iter, int stop_lying_iter, int mom_switch_iter) + void _c_run "run" ( + double* X, int N, int D, double* Y, + int no_dims, double perplexity, + double theta, int rand_seed, + bool skip_random_init, double *init, bool use_init, + int max_iter, int stop_lying_iter, int mom_switch_iter) nogil cdef class BH_SNE: - cdef TSNE* thisptr # hold a C++ instance - - def __cinit__(self): - self.thisptr = new TSNE() - - def __dealloc__(self): - del self.thisptr - @cython.boundscheck(False) @cython.wraparound(False) - def run(self, X, N, D, d, perplexity, theta, seed, init, use_init, max_iter, stop_lying_iter, mom_switch_iter): - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _X = np.ascontiguousarray(X) - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _init = np.ascontiguousarray(init) - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Y = np.zeros((N, d), dtype=np.float64) - self.thisptr.run(&_X[0,0], N, D, &Y[0,0], d, perplexity, theta, seed, False, &_init[0,0], use_init, max_iter, stop_lying_iter, mom_switch_iter) + @staticmethod + def run(X, int d, + double perplexity, double theta, + int seed, init, bool use_init, + int max_iter, int stop_lying_iter, int mom_switch_iter): + cdef int N = X.shape[0] + cdef int D = X.shape[1] + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _X = np.ascontiguousarray( + X, dtype=np.float64) + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _init = np.ascontiguousarray( + init, dtype=np.float64) + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Y = np.zeros( + (N, d), dtype=np.float64, order='C') + with nogil: + _c_run(&_X[0,0], N, D, &Y[0,0], + d, perplexity, + theta, seed, + False, &_init[0,0], use_init, + max_iter, stop_lying_iter, mom_switch_iter) return Y diff --git a/tsne/bh_sne_3d.pyx b/tsne/bh_sne_3d.pyx deleted file mode 100644 index 315e107..0000000 --- a/tsne/bh_sne_3d.pyx +++ /dev/null @@ -1,28 +0,0 @@ -# distutils: language = c++ -import numpy as np -cimport numpy as np -cimport cython -from libcpp cimport bool - -cdef extern from "tsne.h": - cdef cppclass TSNE: - TSNE() - void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed, bool skip_random_init, double *init, bool use_init, int max_iter, int stop_lying_iter, int mom_switch_iter) - -cdef class BH_SNE_3D: - cdef TSNE* thisptr # hold a C++ instance - - def __cinit__(self): - self.thisptr = new TSNE() - - def __dealloc__(self): - del self.thisptr - - @cython.boundscheck(False) - @cython.wraparound(False) - def run(self, X, N, D, d, perplexity, theta, seed, init, use_init, max_iter, stop_lying_iter, mom_switch_iter): - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _X = np.ascontiguousarray(X) - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] _init = np.ascontiguousarray(init) - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] Y = np.zeros((N, d), dtype=np.float64) - self.thisptr.run(&_X[0,0], N, D, &Y[0,0], d, perplexity, theta, seed, False, &_init[0,0], use_init, max_iter, stop_lying_iter, mom_switch_iter) - return Y diff --git a/tsne/bh_sne_src/Makefile b/tsne/bh_sne_src/Makefile index 8c8a1c8..95534dc 100644 --- a/tsne/bh_sne_src/Makefile +++ b/tsne/bh_sne_src/Makefile @@ -2,25 +2,22 @@ #CFLAGS = -march=haswell -ffast-math -O3 -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize #CFLAGS = -march=haswell -ffast-math -O3 -CXX = g++ -CFLAGS = -ffast-math -O3 +CXX?=g++ +CFLAGS?=-ffast-math -O3 -Wall -std=c++14 -all: bh_tsne bh_tsne_3d +all: bh_tsne -bh_tsne: tsne.o sptree.o - $(CXX) $(CFLAGS) tsne.o sptree.o -o bh_tsne - -bh_tsne_3d: tsne_3d.o sptree.o - $(CXX) $(CFLAGS) tsne_3d.o sptree.o -o bh_tsne_3d +bh_tsne: main.o tsne.o sptree.o + $(CXX) $(CFLAGS) $(LDFLAGS) $^ -o $@ sptree.o: sptree.cpp sptree.h - $(CXX) $(CFLAGS) -c sptree.cpp + $(CXX) $(CFLAGS) -c $< tsne.o: tsne.cpp tsne.h sptree.h vptree.h - $(CXX) $(CFLAGS) -c tsne.cpp + $(CXX) $(CFLAGS) -c $< -tsne_3d.o: tsne.cpp tsne.h sptree.h vptree.h - $(CXX) $(CFLAGS) -DTSNE3D -c tsne.cpp +main.o: main.cpp tsne.h + $(CXX) $(CFLAGS) -c $< clean: - rm -Rf *.o bh_tsne bh_tsne_3d + rm -Rf *.o bh_tsne diff --git a/tsne/bh_sne_src/main.cpp b/tsne/bh_sne_src/main.cpp new file mode 100644 index 0000000..6fe7ec5 --- /dev/null +++ b/tsne/bh_sne_src/main.cpp @@ -0,0 +1,142 @@ +/* + * + * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Delft University of + * Technology. + * 4. Neither the name of the Delft University of Technology nor the names of + * its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "tsne.h" + +using namespace std; + +// Function that loads data from a t-SNE file +bool load_data(const char* dat_file, vector* data, int* n, int* d, + int* no_dims, double* theta, double* perplexity, int* rand_seed, + int* max_iter) { + // Open file, read first 2 integers, allocate memory, and read the data + FILE* h; + if ((h = fopen(dat_file, "r+b")) == nullptr) { + fprintf(stderr, "Error: could not open data file.\n"); + return false; + } + fread(n, sizeof(int), 1, h); // number of datapoints + fread(d, sizeof(int), 1, h); // original dimensionality + fread(theta, sizeof(double), 1, h); // gradient accuracy + fread(perplexity, sizeof(double), 1, h); // perplexity + fread(no_dims, sizeof(int), 1, h); // output dimensionality + fread(max_iter, sizeof(int), 1, h); // maximum number of iterations + data->resize(*d * *n); + fread(data->data(), sizeof(double), *n * *d, h); // the data + if (!feof(h)) + fread(rand_seed, sizeof(int), 1, h); // random seed + fclose(h); + fprintf(stderr, "Read the %i x %i data matrix successfully!\n", *n, *d); + return true; +} + +// Function that saves map to a t-SNE file +void save_data(const char* res_file, const vector& data, + const vector& landmarks, const vector& costs, int n, + int d) { + // Open file, write first 2 integers and then the data + FILE* h; + if ((h = fopen(res_file, "w+b")) == nullptr) { + fprintf(stderr, "Error: could not open data file.\n"); + return; + } + fwrite(&n, sizeof(int), 1, h); + fwrite(&d, sizeof(int), 1, h); + fwrite(data.data(), sizeof(double), data.size(), h); + fwrite(landmarks.data(), sizeof(int), landmarks.size(), h); + fwrite(costs.data(), sizeof(double), costs.size(), h); + fclose(h); + fprintf(stderr, "Wrote the %i x %i data matrix successfully!\n", n, d); +} + +void save_csv(const char* csv_file, double* Y, int N, int D) { + std::ofstream csv(csv_file); + + for (int d = 0; d < D; d++) { + csv << "TSNE" << d + 1 << ","; + } + csv << "\n"; + + for (int n = 0; n < N; n++) { + int row_offset = n * D; + for (int d = 0; d < D; d++) { + csv << Y[row_offset + d] << ","; + } + csv << "\n"; + } + + csv.close(); +} + +// Function that runs the Barnes-Hut implementation of t-SNE +int main(int argc, char* argv[]) { + // load input and output + std::string dat_file = "data.dat"; + std::string res_file = "result.dat"; + if (argc > 1) { + dat_file = argv[1]; + res_file = argv[2]; + } + + const char* dat_file_c = dat_file.c_str(); + const char* res_file_c = res_file.c_str(); + + // Define some variables + int origN, N, D, no_dims, max_iter; + double perplexity, theta; + vector data; + int rand_seed = -1; + + // Read the parameters and the dataset + if (load_data(dat_file_c, &data, &origN, &D, &no_dims, &theta, &perplexity, + &rand_seed, &max_iter)) { + // Make dummy landmarks + N = origN; + vector landmarks(N); + for (int n = 0; n < N; n++) + landmarks[n] = n; + + // Now fire up the SNE implementation + vector Y(N * no_dims); + vector costs(N); + run(data.data(), N, D, Y.data(), no_dims, perplexity, theta, rand_seed, + false, nullptr, false, max_iter); + + // Save the results + save_data(res_file_c, Y, landmarks, costs, N, no_dims); + } +} diff --git a/tsne/bh_sne_src/sptree.cpp b/tsne/bh_sne_src/sptree.cpp index 2d187c6..dbad881 100644 --- a/tsne/bh_sne_src/sptree.cpp +++ b/tsne/bh_sne_src/sptree.cpp @@ -12,7 +12,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the Delft University of Technology. + * This product includes software developed by the Delft University of + * Technology. * 4. Neither the name of the Delft University of Technology nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. @@ -20,431 +21,378 @@ * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ -#include -#include -#include -#include +#include +#include +#include #include -#include "sptree.h" - +#include +#include "sptree.h" -// Constructs cell -template -Cell::Cell() { -} +#pragma GCC visibility push(hidden) -template -Cell::Cell(double* inp_corner, double* inp_width) { - for(int d = 0; d < NDims; d++) setCorner(d, inp_corner[d]); - for(int d = 0; d < NDims; d++) setWidth( d, inp_width[d]); -} +using std::array; +using std::make_unique; +using std::vector; -// Destructs cell -template -Cell::~Cell() { +template +Cell::Cell(const double* inp_corner, const double* inp_width) { + for (int d = 0; d < NDims; d++) + setCorner(d, inp_corner[d]); + for (int d = 0; d < NDims; d++) + setWidth(d, inp_width[d]); } -template -double Cell::getCorner(unsigned int d) { - return corner[d]; +template +double Cell::getCorner(unsigned int d) const { + return corner[d]; } -template -double Cell::getWidth(unsigned int d) { - return width[d]; +template +double Cell::getWidth(unsigned int d) const { + return width[d]; } -template +template void Cell::setCorner(unsigned int d, double val) { - corner[d] = val; + corner[d] = val; } -template +template void Cell::setWidth(unsigned int d, double val) { - width[d] = val; + width[d] = val; } // Checks whether a point lies in a cell -template -bool Cell::containsPoint(double point[]) -{ - for(int d = 0; d < NDims; d++) { - if(corner[d] - width[d] > point[d]) return false; - if(corner[d] + width[d] < point[d]) return false; - } - return true; +template +bool Cell::containsPoint(const double point[]) const { + assert(point); + for (int d = 0; d < NDims; d++) { + if (corner[d] - width[d] > point[d]) + return false; + if (corner[d] + width[d] < point[d]) + return false; + } + return true; } - // Default constructor for SPTree -- build tree, too! -template -SPTree::SPTree(double* inp_data, unsigned int N) -{ - - // Compute mean, width, and height of current map (boundaries of SPTree) - int nD = 0; - double* mean_Y = (double*) calloc(NDims, sizeof(double)); - double* min_Y = (double*) malloc(NDims * sizeof(double)); - double* max_Y = (double*) malloc(NDims * sizeof(double)); - - for(unsigned int d = 0; d < NDims; d++) { - min_Y[d] = DBL_MAX; - max_Y[d] = -DBL_MAX; +template +SPTree::SPTree(const double* inp_data, unsigned int N) : data(inp_data) { + // Compute mean, width, and height of current map (boundaries of SPTree) + int nD = 0; + array mean_Y; + mean_Y.fill(0.0); + array min_Y; + array max_Y; + min_Y.fill(DBL_MAX); + max_Y.fill(-DBL_MAX); + + for (unsigned int n = 0; n < N; n++) { + for (unsigned int d = 0; d < NDims; d++) { + mean_Y[d] += inp_data[n * NDims + d]; + if (inp_data[nD + d] < min_Y[d]) + min_Y[d] = inp_data[nD + d]; + if (inp_data[nD + d] > max_Y[d]) + max_Y[d] = inp_data[nD + d]; } - - for(unsigned int n = 0; n < N; n++) { - for(unsigned int d = 0; d < NDims; d++) { - mean_Y[d] += inp_data[n * NDims + d]; - if(inp_data[nD + d] < min_Y[d]) min_Y[d] = inp_data[nD + d]; - if(inp_data[nD + d] > max_Y[d]) max_Y[d] = inp_data[nD + d]; - } - nD += NDims; - } - - for(int d = 0; d < NDims; d++) mean_Y[d] /= (double) N; - - // Construct SPTree - double* width = (double*) malloc(NDims * sizeof(double)); - for(int d = 0; d < NDims; d++) width[d] = fmax(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5; - init(NULL, inp_data, mean_Y, width); - fill(N); - - // Clean up memory - free(mean_Y); - free(max_Y); - free(min_Y); - free(width); -} - - -// Constructor for SPTree with particular size and parent -- build the tree, too! -template -SPTree::SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width) -{ - init(NULL, inp_data, inp_corner, inp_width); - fill(N); + nD += NDims; + } + + for (int d = 0; d < NDims; d++) + mean_Y[d] /= (double)N; + + // Construct SPTree + array width; + for (int d = 0; d < NDims; d++) + width[d] = fmax(max_Y[d] - mean_Y[d], mean_Y[d] - min_Y[d]) + 1e-5; + node.init(mean_Y.data(), width.data()); + fill(N); } - -// Constructor for SPTree with particular size (do not fill the tree) -template -SPTree::SPTree(double* inp_data, double* inp_corner, double* inp_width) -{ - init(NULL, inp_data, inp_corner, inp_width); -} - - -// Constructor for SPTree with particular size and parent (do not fill tree) -template -SPTree::SPTree(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width) { - init(inp_parent, inp_data, inp_corner, inp_width); -} - - -// Constructor for SPTree with particular size and parent -- build the tree, too! -template -SPTree::SPTree(SPTree* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width) -{ - init(inp_parent, inp_data, inp_corner, inp_width); - fill(N); -} - - // Main initialization function -template -void SPTree::init(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width) -{ - parent = inp_parent; - data = inp_data; - is_leaf = true; - size = 0; - cum_size = 0; - - for(unsigned int d = 0; d < NDims; d++) boundary.setCorner(d, inp_corner[d]); - for(unsigned int d = 0; d < NDims; d++) boundary.setWidth( d, inp_width[d]); - - for(unsigned int i = 0; i < no_children; i++) children[i] = NULL; - for(unsigned int d = 0; d < NDims; d++) center_of_mass[d] = .0; -} - - -// Destructor for SPTree -template -SPTree::~SPTree() -{ - for(unsigned int i = 0; i < no_children; i++) { - if(children[i] != NULL) delete children[i]; - } -} - - -// Update the data underlying this tree -template -void SPTree::setData(double* inp_data) -{ - data = inp_data; -} - - -// Get the parent of the current tree -template -SPTree* SPTree::getParent() -{ - return parent; +template +void SPTree::Node::init(const double* inp_corner, + const double* inp_width) { + cum_size = 0; + + for (unsigned int d = 0; d < NDims; d++) + boundary.setCorner(d, inp_corner[d]); + for (unsigned int d = 0; d < NDims; d++) + boundary.setWidth(d, inp_width[d]); + + for (unsigned int d = 0; d < NDims; d++) + center_of_mass[d] = .0; } - // Insert a point into the SPTree -template -bool SPTree::insert(unsigned int new_index) -{ - // Ignore objects which do not belong in this quad tree - double* point = data + new_index * NDims; - if(!boundary.containsPoint(point)) - return false; - - // Online update of cumulative size and center-of-mass - cum_size++; - double mult1 = (double) (cum_size - 1) / (double) cum_size; - double mult2 = 1.0 / (double) cum_size; - - for(unsigned int d = 0; d < NDims; d++) { - center_of_mass[d] = center_of_mass[d] * mult1 + mult2 * point[d]; - } - - // If there is space in this quad tree and it is a leaf, add the object here - if(is_leaf && size < QT_NODE_CAPACITY) { - index[size] = new_index; - size++; - return true; - } +template +bool SPTree::Node::insert(const double* data, unsigned int new_index) { + assert(data); + // Ignore objects which do not belong in this quad tree + const double* point = data + new_index * NDims; + if (!boundary.containsPoint(point)) + return false; - // Don't add duplicates for now (this is not very nice) - bool any_duplicate = false; - for(unsigned int n = 0; n < size; n++) { - bool duplicate = true; - for(unsigned int d = 0; d < NDims; d++) { - if(point[d] != data[index[n] * NDims + d]) { duplicate = false; break; } - } - any_duplicate = any_duplicate | duplicate; - } - if(any_duplicate) return true; + // Online update of cumulative size and center-of-mass + ++cum_size; + double mult1 = (double)(cum_size - 1) / (double)cum_size; + double mult2 = 1.0 / (double)cum_size; - // Otherwise, we need to subdivide the current cell - if(is_leaf) subdivide(); + for (unsigned int d = 0; d < NDims; d++) { + center_of_mass[d] = center_of_mass[d] * mult1 + mult2 * point[d]; + } - // Find out where the point can be inserted - for(unsigned int i = 0; i < no_children; i++) { - if(children[i]->insert(new_index)) return true; + // If there is space in this quad tree and it is a leaf, add the object here + if (cum_size == 1) { + index = new_index; + return true; + } + + // Don't add duplicates for now (this is not very nice) + if (!children) { + bool duplicate = true; + for (unsigned int d = 0; d < NDims; d++) { + if (point[d] != data[index * NDims + d]) { + duplicate = false; + break; + } } - - // Otherwise, the point cannot be inserted (this should never happen) - return false; + if (duplicate) + return true; + } + + // Otherwise, we need to subdivide the current cell + if (!children) + subdivide(data); + + // Find out where the point can be inserted + for (auto& child : *children) + if (child.insert(data, new_index)) + return true; + // Otherwise, the point cannot be inserted (this should never happen) + return false; } - -// Create four children which fully divide this cell into four quads of equal area -template -void SPTree::subdivide() { - - // Create new children - double new_corner[NDims]; - double new_width[NDims]; - for(unsigned int i = 0; i < no_children; i++) { - unsigned int div = 1; - for(unsigned int d = 0; d < NDims; d++) { - new_width[d] = .5 * boundary.getWidth(d); - if((i / div) % 2 == 1) new_corner[d] = boundary.getCorner(d) - .5 * boundary.getWidth(d); - else new_corner[d] = boundary.getCorner(d) + .5 * boundary.getWidth(d); - div *= 2; - } - children[i] = new SPTree(this, data, new_corner, new_width); +// Create four children which fully divide this cell into four quads of equal +// area +template +void SPTree::Node::subdivide(const double* data) { + assert(!children); + // Create new children + double new_corner[NDims]; + double new_width[NDims]; + for (unsigned int d = 0; d < NDims; d++) { + new_width[d] = .5 * boundary.getWidth(d); + } + children = make_unique::Node, no_children>>(); + for (unsigned int i = 0; i < no_children; i++) { + unsigned int div = 1; + for (unsigned int d = 0; d < NDims; d++) { + if ((i / div) % 2 == 1) + new_corner[d] = boundary.getCorner(d) - .5 * boundary.getWidth(d); + else + new_corner[d] = boundary.getCorner(d) + .5 * boundary.getWidth(d); + div *= 2; } + (*children)[i].init(new_corner, new_width); + } - // Move existing points to correct children - for(unsigned int i = 0; i < size; i++) { - bool success = false; - for(unsigned int j = 0; j < no_children; j++) { - if(!success) success = children[j]->insert(index[i]); - } - index[i] = -1; + // Move existing points to correct children + for (auto& child : *children) { + if (child.insert(data, index)) { + break; } - - // Empty parent node - size = 0; - is_leaf = false; + } + index = -1; } - // Build SPTree on dataset -template -void SPTree::fill(unsigned int N) -{ - for(unsigned int i = 0; i < N; i++) insert(i); +template +void SPTree::fill(unsigned int N) { + for (unsigned int i = 0; i < N; i++) + node.insert(data, i); } - // Checks whether the specified tree is correct -template -bool SPTree::isCorrect() -{ - for(unsigned int n = 0; n < size; n++) { - double* point = data + index[n] * NDims; - if(!boundary.containsPoint(point)) return false; - } - if(!is_leaf) { - bool correct = true; - for(int i = 0; i < no_children; i++) correct = correct && children[i]->isCorrect(); - return correct; - } - else return true; +template +bool SPTree::isCorrect() const { + return node.isCorrect(data); } - - -// Build a list of all indices in SPTree -template -void SPTree::getAllIndices(unsigned int* indices) -{ - getAllIndices(indices, 0); +template +bool SPTree::Node::isCorrect(const double* data) const { + if (!children && cum_size > 0) { + const double* point = data + index * NDims; + if (!boundary.containsPoint(point)) + return false; + } + if (children) { + for (const auto& child : *children) { + if (!child.isCorrect(data)) { + return false; + } + } + } + return true; } - // Build a list of all indices in SPTree -template -unsigned int SPTree::getAllIndices(unsigned int* indices, unsigned int loc) -{ - - // Gather indices in current quadrant - for(unsigned int i = 0; i < size; i++) indices[loc + i] = index[i]; - loc += size; - - // Gather indices in children - if(!is_leaf) { - for(int i = 0; i < no_children; i++) loc = children[i]->getAllIndices(indices, loc); - } - return loc; +template +void SPTree::getAllIndices(unsigned int* indices) const { + node.getAllIndices(indices, 0); } -template -unsigned int SPTree::getDepth() { - if(is_leaf) return 1; - int depth = 0; - for(unsigned int i = 0; i < no_children; i++) depth = fmax(depth, children[i]->getDepth()); - return 1 + depth; +// Build a list of all indices in SPTree +template +unsigned int SPTree::Node::getAllIndices(unsigned int* indices, + unsigned int loc) const { + // Gather indices in current quadrant + if (!children && cum_size > 0) { + indices[++loc] = index; + } + + // Gather indices in children + if (children) { + for (auto& child : *children) + loc = child.getAllIndices(indices, loc); + } + return loc; } - // Compute non-edge forces using Barnes-Hut algorithm -template -void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q) -{ +template +void SPTree::computeNonEdgeForces(unsigned int point_index, double theta, + double neg_f[], double* sum_Q) const { + node.computeNonEdgeForces(data, point_index, theta, neg_f, sum_Q); +} - // Make sure that we spend no time on empty nodes or self-interactions - if(cum_size == 0 || (is_leaf && size == 1 && index[0] == point_index)) return; +template +[[gnu::hot]] void SPTree::Node::computeNonEdgeForces( + const double* data, unsigned int point_index, double theta, double neg_f[], + double* sum_Q) const { + // Make sure that we spend no time on empty nodes or self-interactions + if (cum_size == 0 || (!children && cum_size >= 1 && index == point_index)) + return; + + // Compute distance between point and center-of-mass + double sqdist = .0; + unsigned int ind = point_index * NDims; + + array buff; + for (unsigned int d = 0; d < NDims; d++) { + buff[d] = data[ind + d] - center_of_mass[d]; + sqdist += buff[d] * buff[d]; + } + + // Check whether we can use this node as a "summary" + double max_width = 0.0; + for (unsigned int d = 0; d < NDims; d++) { + double cur_width = boundary.getWidth(d); + max_width = (max_width > cur_width) ? max_width : cur_width; + } + if (!children || max_width * max_width < sqdist * theta * theta) { + // Compute and add t-SNE force between point and current node + sqdist = 1.0 / (1.0 + sqdist); + double mult = cum_size * sqdist; + *sum_Q += mult; + mult *= sqdist; + for (unsigned int d = 0; d < NDims; d++) + neg_f[d] += mult * buff[d]; + } else { + // Recursively apply Barnes-Hut to children + for (const auto& child : *children) + child.computeNonEdgeForces(data, point_index, theta, neg_f, sum_Q); + } +} - // Compute distance between point and center-of-mass - double sqdist = .0; - unsigned int ind = point_index * NDims; +// Computes edge forces +template +vector SPTree::computeEdgeForces(const unsigned int* row_P, + const unsigned int* col_P, + const double* val_P, + int N) const { + return node.computeEdgeForces(data, row_P, col_P, val_P, N); +} - for(unsigned int d = 0; d < NDims; d++) { - buff[d] = data[ind + d] - center_of_mass[d]; +template +vector SPTree::Node::computeEdgeForces(const double* data, + const unsigned int* row_P, + const unsigned int* col_P, + const double* val_P, + int N) const { + vector pos_f(N * NDims); + // Loop over all edges in the graph + unsigned int ind1 = 0; + unsigned int ind2 = 0; + double sqdist; + for (unsigned int n = 0; n < N; n++) { + for (unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { + // Compute pairwise distance and Q-value + sqdist = 1.0; + ind2 = col_P[i] * NDims; + + array buff; + for (unsigned int d = 0; d < NDims; d++) { + buff[d] = data[ind1 + d] - data[ind2 + d]; sqdist += buff[d] * buff[d]; - } - - // Check whether we can use this node as a "summary" - double max_width = 0.0; - double cur_width; - for(unsigned int d = 0; d < NDims; d++) { - cur_width = boundary.getWidth(d); - max_width = (max_width > cur_width) ? max_width : cur_width; - } - if(is_leaf || max_width / sqrt(sqdist) < theta) { - - // Compute and add t-SNE force between point and current node - sqdist = 1.0 / (1.0 + sqdist); - double mult = cum_size * sqdist; - *sum_Q += mult; - mult *= sqdist; - for(unsigned int d = 0; d < NDims; d++) neg_f[d] += mult * buff[d]; - } - else { + } - // Recursively apply Barnes-Hut to children - for(unsigned int i = 0; i < no_children; i++) children[i]->computeNonEdgeForces(point_index, theta, neg_f, sum_Q); - } -} + sqdist = val_P[i] / sqdist; - -// Computes edge forces -template -void SPTree::computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f) -{ - - // Loop over all edges in the graph - unsigned int ind1 = 0; - unsigned int ind2 = 0; - double sqdist; - for(unsigned int n = 0; n < N; n++) { - for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { - - // Compute pairwise distance and Q-value - sqdist = 1.0; - ind2 = col_P[i] * NDims; - - for(unsigned int d = 0; d < NDims; d++) { - buff[d] = data[ind1 + d] - data[ind2 + d]; - sqdist += buff[d] * buff[d]; - } - - sqdist = val_P[i] / sqdist; - - // Sum positive force - for(unsigned int d = 0; d < NDims; d++) pos_f[ind1 + d] += sqdist * buff[d]; - } - ind1 += NDims; + // Sum positive force + for (unsigned int d = 0; d < NDims; d++) + pos_f[ind1 + d] += sqdist * buff[d]; } + ind1 += NDims; + } + return pos_f; } - // Print out tree -template -void SPTree::print() -{ - if(cum_size == 0) { - fprintf(stderr,"Empty node\n"); - return; - } +template +void SPTree::print() const { + node.print(data); +} - if(is_leaf) { - fprintf(stderr,"Leaf node; data = ["); - for(int i = 0; i < size; i++) { - double* point = data + index[i] * NDims; - for(int d = 0; d < NDims; d++) fprintf(stderr,"%f, ", point[d]); - fprintf(stderr," (index = %d)", index[i]); - if(i < size - 1) fprintf(stderr,"\n"); - else fprintf(stderr,"]\n"); - } - } - else { - fprintf(stderr,"Intersection node with center-of-mass = ["); - for(int d = 0; d < NDims; d++) fprintf(stderr,"%f, ", center_of_mass[d]); - fprintf(stderr,"]; children are:\n"); - for(int i = 0; i < no_children; i++) children[i]->print(); +template +void SPTree::Node::print(const double* data) const { + if (cum_size == 0) { + fprintf(stderr, "Empty node\n"); + return; + } + + if (!children) { + fprintf(stderr, "Leaf node; data = ["); + if (!children && cum_size > 0) { + const double* point = data + index * NDims; + for (int d = 0; d < NDims; d++) + fprintf(stderr, "%f, ", point[d]); + fprintf(stderr, " (index = %d)", index); } + fprintf(stderr, "]\n"); + } else { + fprintf(stderr, "Intersection node with center-of-mass = ["); + for (int d = 0; d < NDims; d++) + fprintf(stderr, "%f, ", center_of_mass[d]); + fprintf(stderr, "]; children are:\n"); + for (const auto& child : *children) + child.print(data); + } } // declare templates explicitly template class SPTree<2>; template class SPTree<3>; + +#pragma GCC visibility pop diff --git a/tsne/bh_sne_src/sptree.h b/tsne/bh_sne_src/sptree.h index be42686..e399b60 100644 --- a/tsne/bh_sne_src/sptree.h +++ b/tsne/bh_sne_src/sptree.h @@ -12,7 +12,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the Delft University of Technology. + * This product includes software developed by the Delft University of + * Technology. * 4. Neither the name of the Delft University of Technology nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. @@ -20,103 +21,110 @@ * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ - #ifndef SPTREE_H #define SPTREE_H -using namespace std; - +#include +#include +#include -template -class Cell { - double corner[NDims]; - double width[NDims]; +#pragma GCC visibility push(hidden) +template +class alignas(16) Cell { + std::array corner; + std::array width; -public: - Cell(); - Cell(double* inp_corner, double* inp_width); - ~Cell(); + public: + Cell() = default; + Cell(const double* inp_corner, const double* inp_width); + ~Cell() = default; - double getCorner(unsigned int d); - double getWidth(unsigned int d); - void setCorner(unsigned int d, double val); - void setWidth(unsigned int d, double val); - bool containsPoint(double point[]); + double getCorner(unsigned int d) const; + double getWidth(unsigned int d) const; + void setCorner(unsigned int d, double val); + void setWidth(unsigned int d, double val); + bool containsPoint(const double point[]) const; }; -template -class SPTree -{ -public: - enum { no_children = 2 * SPTree::no_children }; - -private: - // Fixed constants - static const unsigned int QT_NODE_CAPACITY = 1; +template +class SPTree { + public: + enum { no_children = 2 * SPTree::no_children }; + + private: + class alignas(16) Node { + public: + bool isCorrect(const double* data) const; + bool insert(const double* data, unsigned int new_index); + void subdivide(const double* data); + void print(const double* data) const; + void computeNonEdgeForces(const double* data, unsigned int point_index, + double theta, double neg_f[], + double* sum_Q) const; + std::vector computeEdgeForces(const double* data, + const unsigned int* row_P, + const unsigned int* col_P, + const double* val_P, int N) const; + unsigned int getAllIndices(unsigned int* indices, unsigned int loc) const; + void init(const double* inp_corner, const double* inp_width); + + private: + std::array center_of_mass; + + // Axis-aligned bounding box stored as a center with half-dimensions to + // represent the boundaries of this quad tree + Cell boundary; - // A buffer we use when doing force computations - double buff[NDims]; + // Children + std::unique_ptr::Node, no_children>> children; // Properties of this node in the tree - SPTree* parent; - unsigned int dimension; - bool is_leaf; - unsigned int size; + unsigned int index; unsigned int cum_size; - - // Axis-aligned bounding box stored as a center with half-dimensions to represent the boundaries of this quad tree - Cell boundary; - - // Indices in this space-partitioning tree node, corresponding center-of-mass, and list of all children - double* data; - double center_of_mass[NDims]; - unsigned int index[QT_NODE_CAPACITY]; - - // Children - SPTree* children[no_children]; - -public: - SPTree(double* inp_data, unsigned int N); - SPTree(double* inp_data, double* inp_corner, double* inp_width); - SPTree(double* inp_data, unsigned int N, double* inp_corner, double* inp_width); - SPTree(SPTree* inp_parent, double* inp_data, unsigned int N, double* inp_corner, double* inp_width); - SPTree(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width); - ~SPTree(); - void setData(double* inp_data); - SPTree* getParent(); - void construct(Cell boundary); - bool insert(unsigned int new_index); - void subdivide(); - bool isCorrect(); - void rebuildTree(); - void getAllIndices(unsigned int* indices); - unsigned int getDepth(); - void computeNonEdgeForces(unsigned int point_index, double theta, double neg_f[], double* sum_Q); - void computeEdgeForces(unsigned int* row_P, unsigned int* col_P, double* val_P, int N, double* pos_f); - void print(); - -private: - void init(SPTree* inp_parent, double* inp_data, double* inp_corner, double* inp_width); - void fill(unsigned int N); - unsigned int getAllIndices(unsigned int* indices, unsigned int loc); - bool isChild(unsigned int test_index, unsigned int start, unsigned int end); + }; + + // Indices in this space-partitioning tree node, corresponding center-of-mass, + // and list of all children + const double* data; + + Node node; + + SPTree(const SPTree&) = delete; + SPTree& operator=(const SPTree&) = delete; + + public: + SPTree() = default; + SPTree(SPTree&&) = default; + SPTree(const double* inp_data, unsigned int N); + ~SPTree() = default; + bool isCorrect() const; + void getAllIndices(unsigned int* indices) const; + void computeNonEdgeForces(unsigned int point_index, double theta, + double neg_f[], double* sum_Q) const; + std::vector computeEdgeForces(const unsigned int* row_P, + const unsigned int* col_P, + const double* val_P, int N) const; + void print() const; + + private: + void fill(unsigned int N); }; template <> -struct SPTree<0> -{ - enum { no_children = 1 }; +struct SPTree<0> { + enum { no_children = 1 }; }; +#pragma GCC visibility pop #endif diff --git a/tsne/bh_sne_src/tsne.cpp b/tsne/bh_sne_src/tsne.cpp index 5fac463..da43130 100644 --- a/tsne/bh_sne_src/tsne.cpp +++ b/tsne/bh_sne_src/tsne.cpp @@ -12,7 +12,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the Delft University of Technology. + * This product includes software developed by the Delft University of + * Technology. * 4. Neither the name of the Delft University of Technology nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. @@ -20,771 +21,780 @@ * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include -#include -#include "vptree.h" + #include "sptree.h" #include "tsne.h" +#include "vptree.h" -using namespace std; - -#ifdef TSNE3D -#define NDIMS 3 -#else -#define NDIMS 2 -#endif - -// Perform t-SNE -void TSNE::run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed, - bool skip_random_init, double *init, bool use_init, - int max_iter, int stop_lying_iter, int mom_switch_iter - ) { - - // Set random seed - if (skip_random_init != true) { - if(rand_seed >= 0) { - fprintf(stderr,"Using random seed: %d\n", rand_seed); - srand((unsigned int) rand_seed); - } else { - fprintf(stderr,"Using current time as random seed...\n"); - srand(time(NULL)); - } - } - - // Determine whether we are using an exact algorithm - if(N - 1 < 3 * perplexity) { fprintf(stderr,"Perplexity too large for the number of data points!\n"); exit(1); } - fprintf(stderr,"Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta); - bool exact = (theta == .0) ? true : false; - - fprintf(stderr,"Using max_iter = %d, stop_lying_iter = %d, mom_switch_iter = %d\n", max_iter, stop_lying_iter, mom_switch_iter); - - // Set learning parameters - float total_time = .0; - clock_t start, end; - double momentum = .5, final_momentum = .8; - double eta = 200.0; - - // Allocate some memory - double* dY = (double*) malloc(N * no_dims * sizeof(double)); - double* uY = (double*) malloc(N * no_dims * sizeof(double)); - double* gains = (double*) malloc(N * no_dims * sizeof(double)); - if(dY == NULL || uY == NULL || gains == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - for(int i = 0; i < N * no_dims; i++) uY[i] = .0; - for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0; - - // Normalize input data (to prevent numerical problems) - fprintf(stderr,"Computing input similarities...\n"); - start = clock(); - zeroMean(X, N, D); - double max_X = .0; - for(int i = 0; i < N * D; i++) { - if(fabs(X[i]) > max_X) max_X = fabs(X[i]); - } - for(int i = 0; i < N * D; i++) X[i] /= max_X; - - // Compute input similarities for exact t-SNE - double* P; unsigned int* row_P; unsigned int* col_P; double* val_P; - if(exact) { - - // Compute similarities - fprintf(stderr,"Exact?"); - P = (double*) malloc(N * N * sizeof(double)); - if(P == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - computeGaussianPerplexity(X, N, D, P, perplexity); - - // Symmetrize input similarities - fprintf(stderr,"Symmetrizing...\n"); - int nN = 0; - for(int n = 0; n < N; n++) { - int mN = (n + 1) * N; - for(int m = n + 1; m < N; m++) { - P[nN + m] += P[mN + n]; - P[mN + n] = P[nN + m]; - mN += N; - } - nN += N; - } - double sum_P = .0; - for(int i = 0; i < N * N; i++) sum_P += P[i]; - for(int i = 0; i < N * N; i++) P[i] /= sum_P; - } - - // Compute input similarities for approximate t-SNE - else { - - // Compute asymmetric pairwise input similarities - computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity, (int) (3 * perplexity)); - - // Symmetrize input similarities - symmetrizeMatrix(&row_P, &col_P, &val_P, N); - double sum_P = .0; - for(int i = 0; i < row_P[N]; i++) sum_P += val_P[i]; - for(int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P; - } - end = clock(); - - // Lie about the P-values - if(exact) { for(int i = 0; i < N * N; i++) P[i] *= 12.0; } - else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; } - - // Initialize solution (randomly or with given coordinates) - if (!use_init && !skip_random_init) { - for(int i = 0; i < N * no_dims; i++) { - Y[i] = randn() * .0001; - } - } else if (use_init) { - for(int i = 0; i < N * no_dims; i++) { - Y[i] = init[i]; - } - } - - // Perform main training loop - if(exact) fprintf(stderr,"Input similarities computed in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC); - else fprintf(stderr,"Input similarities computed in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N)); - start = clock(); - - for(int iter = 0; iter < max_iter; iter++) { - - // Compute (approximate) gradient - if(exact) computeExactGradient(P, Y, N, no_dims, dY); - else computeGradient(P, row_P, col_P, val_P, Y, N, no_dims, dY, theta); - - // Update gains - for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8); - for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01; - - // Perform gradient update (with momentum and gains) - for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i]; - for(int i = 0; i < N * no_dims; i++) Y[i] = Y[i] + uY[i]; - - // Make solution zero-mean - zeroMean(Y, N, no_dims); - - // Stop lying about the P-values after a while, and switch momentum - if(iter == stop_lying_iter) { - if(exact) { for(int i = 0; i < N * N; i++) P[i] /= 12.0; } - else { for(int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; } - } - if(iter == mom_switch_iter) momentum = final_momentum; - - // Print out progress - if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) { - end = clock(); - double C = .0; - if(exact) C = evaluateError(P, Y, N, no_dims); - else C = evaluateError(row_P, col_P, val_P, Y, N, no_dims, theta); // doing approximate computation here! - if(iter == 0) - fprintf(stderr,"Iteration %d: error is %f\n", iter + 1, C); - else { - total_time += (float) (end - start) / CLOCKS_PER_SEC; - fprintf(stderr,"Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter, C, (float) (end - start) / CLOCKS_PER_SEC); - } - start = clock(); - } - } - end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC; - - // Clean up memory - free(dY); - free(uY); - free(gains); - if(exact) free(P); - else { - free(row_P); row_P = NULL; - free(col_P); col_P = NULL; - free(val_P); val_P = NULL; - } - fprintf(stderr,"Fitting performed in %4.2f seconds.\n", total_time); +#pragma GCC visibility push(hidden) + +using std::abs; +using std::array; +using std::exp; +using std::fprintf; +using std::log; +using std::move; +using std::sqrt; +using std::vector; + +namespace { + +template +void run(double* X, int N, int D, double* Y, double perplexity, double theta, + int rand_seed, bool skip_random_init, double* init, bool use_init, + int max_iter, int stop_lying_iter, int mom_switch_iter); + +template +void computeGradient(const vector& inp_row_P, + const vector& inp_col_P, + const vector& inp_val_P, double* Y, int N, + vector& dC, double theta); +template +void computeExactGradient(const vector& P, double* Y, int N, + double* dC); +template +double evaluateError(const vector& P, double* Y, int N); +template +double evaluateError(const vector& row_P, + const vector& col_P, + const vector& val_P, double* Y, int N, + double theta); +void zeroMean(double* X, int N, int D); +template +void zeroMean(double* X, int N); +void computeGaussianPerplexity(double* X, int N, int D, double* P, + double perplexity); +void computeGaussianPerplexity(double* X, int N, int D, + vector* _row_P, + vector* _col_P, + vector* _val_P, double perplexity, + int K); +vector computeSquaredEuclideanDistance(double* X, int N, int D); +double randn(); +void symmetrizeMatrix(vector* row_P, vector* col_P, + vector* val_P, int N); + +static inline double sign(double x) { + return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); } - // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm) -void TSNE::computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta) -{ - - // Construct space-partitioning tree on current map - SPTree* tree = new SPTree(Y, N); - - // Compute all terms required for t-SNE gradient - double sum_Q = .0; - double* pos_f = (double*) calloc(N * D, sizeof(double)); - double* neg_f = (double*) calloc(N * D, sizeof(double)); - if(pos_f == NULL || neg_f == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f); - for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q); - - // Compute final t-SNE gradient - for(int i = 0; i < N * D; i++) { - dC[i] = pos_f[i] - (neg_f[i] / sum_Q); - } - free(pos_f); - free(neg_f); - delete tree; +template +void computeGradient(const vector& inp_row_P, + const vector& inp_col_P, + const vector& inp_val_P, double* Y, int N, + vector& dC, double theta) { + // Construct space-partitioning tree on current map + SPTree tree(Y, N); + + // Compute all terms required for t-SNE gradient + double sum_Q = .0; + vector neg_f(N * NDIMS); + auto pos_f = tree.computeEdgeForces(inp_row_P.data(), inp_col_P.data(), + inp_val_P.data(), N); + for (int n = 0; n < N; n++) + tree.computeNonEdgeForces(n, theta, neg_f.data() + n * NDIMS, &sum_Q); + + // Compute final t-SNE gradient + for (int i = 0; i < N * NDIMS; i++) { + dC[i] = pos_f[i] - (neg_f[i] / sum_Q); + } } // Compute gradient of the t-SNE cost function (exact) -void TSNE::computeExactGradient(double* P, double* Y, int N, int D, double* dC) { - - // Make sure the current gradient contains zeros - for(int i = 0; i < N * D; i++) dC[i] = 0.0; - - // Compute the squared Euclidean distance matrix - double* DD = (double*) malloc(N * N * sizeof(double)); - if(DD == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - computeSquaredEuclideanDistance(Y, N, D, DD); - - // Compute Q-matrix and normalization sum - double* Q = (double*) malloc(N * N * sizeof(double)); - if(Q == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - double sum_Q = .0; - int nN = 0; - for(int n = 0; n < N; n++) { - for(int m = 0; m < N; m++) { - if(n != m) { - Q[nN + m] = 1 / (1 + DD[nN + m]); - sum_Q += Q[nN + m]; - } +template +void computeExactGradient(const vector& P, double* Y, int N, + vector& dC) { + // Make sure the current gradient contains zeros + dC.assign(N * D, 0.0); + + // Compute the squared Euclidean distance matrix + vector DD = computeSquaredEuclideanDistance(Y, N, D); + + // Compute Q-matrix and normalization sum + vector Q(N * N); + double sum_Q = .0; + int nN = 0; + for (int n = 0; n < N; n++) { + for (int m = 0; m < N; m++) { + if (n != m) { + Q[nN + m] = 1 / (1 + DD[nN + m]); + sum_Q += Q[nN + m]; + } + } + nN += N; + } + + // Perform the computation of the gradient + nN = 0; + int nD = 0; + for (int n = 0; n < N; n++) { + int mD = 0; + for (int m = 0; m < N; m++) { + if (n != m) { + double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m]; + for (int d = 0; d < D; d++) { + dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult; } - nN += N; + } + mD += D; } - - // Perform the computation of the gradient - nN = 0; - int nD = 0; - for(int n = 0; n < N; n++) { - int mD = 0; - for(int m = 0; m < N; m++) { - if(n != m) { - double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m]; - for(int d = 0; d < D; d++) { - dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult; - } - } - mD += D; - } - nN += N; - nD += D; - } - - // Free memory - free(DD); DD = NULL; - free(Q); Q = NULL; + nN += N; + nD += D; + } } - // Evaluate t-SNE cost function (exactly) -double TSNE::evaluateError(double* P, double* Y, int N, int D) { - - // Compute the squared Euclidean distance matrix - double* DD = (double*) malloc(N * N * sizeof(double)); - double* Q = (double*) malloc(N * N * sizeof(double)); - if(DD == NULL || Q == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - computeSquaredEuclideanDistance(Y, N, D, DD); - - // Compute Q-matrix and normalization sum - int nN = 0; - double sum_Q = DBL_MIN; - for(int n = 0; n < N; n++) { - for(int m = 0; m < N; m++) { - if(n != m) { - Q[nN + m] = 1 / (1 + DD[nN + m]); - sum_Q += Q[nN + m]; - } - else Q[nN + m] = DBL_MIN; - } - nN += N; - } - for(int i = 0; i < N * N; i++) Q[i] /= sum_Q; - - // Sum t-SNE error - double C = .0; - for(int n = 0; n < N * N; n++) { - C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN)); +template +double evaluateError(const vector& P, double* Y, int N) { + // Compute the squared Euclidean distance matrix + vector DD = computeSquaredEuclideanDistance(Y, N, D); + vector Q(N * N); + + // Compute Q-matrix and normalization sum + int nN = 0; + double sum_Q = DBL_MIN; + for (int n = 0; n < N; n++) { + for (int m = 0; m < N; m++) { + if (n != m) { + Q[nN + m] = 1 / (1 + DD[nN + m]); + sum_Q += Q[nN + m]; + } else + Q[nN + m] = DBL_MIN; } - - // Clean up memory - free(DD); - free(Q); - return C; + nN += N; + } + for (int i = 0; i < N * N; i++) + Q[i] /= sum_Q; + + // Sum t-SNE error + double C = .0; + for (int n = 0; n < N * N; n++) { + C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN)); + } + return C; } // Evaluate t-SNE cost function (approximately) -double TSNE::evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta) -{ - - // Get estimate of normalization term - SPTree* tree = new SPTree(Y, N); - double* buff = (double*) calloc(D, sizeof(double)); - double sum_Q = .0; - for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q); - - // Loop over all edges to compute t-SNE error - int ind1, ind2; - double C = .0, Q; - for(int n = 0; n < N; n++) { - ind1 = n * D; - for(int i = row_P[n]; i < row_P[n + 1]; i++) { - Q = .0; - ind2 = col_P[i] * D; - for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d]; - for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d]; - for(int d = 0; d < D; d++) Q += buff[d] * buff[d]; - Q = (1.0 / (1.0 + Q)) / sum_Q; - C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); - } +template +double evaluateError(const vector& row_P, + const vector& col_P, + const vector& val_P, double* Y, int N, + double theta) { + // Get estimate of normalization term + array buff; + buff.fill(0); + double sum_Q = .0; + { + SPTree tree(Y, N); + for (int n = 0; n < N; n++) + tree.computeNonEdgeForces(n, theta, buff.data(), &sum_Q); + } + + // Loop over all edges to compute t-SNE error + int ind1, ind2; + double C = .0, Q; + for (int n = 0; n < N; n++) { + ind1 = n * NDIMS; + for (int i = row_P[n]; i < row_P[n + 1]; i++) { + Q = .0; + ind2 = col_P[i] * NDIMS; + for (int d = 0; d < NDIMS; d++) + buff[d] = Y[ind1 + d]; + for (int d = 0; d < NDIMS; d++) + buff[d] -= Y[ind2 + d]; + for (int d = 0; d < NDIMS; d++) + Q += buff[d] * buff[d]; + Q = (1.0 / (1.0 + Q)) / sum_Q; + C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN)); } + } - // Clean up memory - free(buff); - delete tree; - return C; + // Clean up memory + return C; } - // Compute input similarities with a fixed perplexity -void TSNE::computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity) { +void computeGaussianPerplexity(double* X, int N, int D, double* P, + double perplexity) { + // Compute the squared Euclidean distance matrix + vector DD = computeSquaredEuclideanDistance(X, N, D); + + // Compute the Gaussian kernel row by row + int nN = 0; + for (int n = 0; n < N; n++) { + // Initialize some variables + bool found = false; + double beta = 1.0; + double min_beta = -DBL_MAX; + double max_beta = DBL_MAX; + double tol = 1e-5; + double sum_P; + + // Iterate until we found a good perplexity + int iter = 0; + while (!found && iter < 200) { + // Compute Gaussian kernel row + for (int m = 0; m < N; m++) + P[nN + m] = exp(-beta * DD[nN + m]); + P[nN + n] = DBL_MIN; + + // Compute entropy of current row + sum_P = DBL_MIN; + for (int m = 0; m < N; m++) + sum_P += P[nN + m]; + double H = 0.0; + for (int m = 0; m < N; m++) + H += beta * (DD[nN + m] * P[nN + m]); + H = (H / sum_P) + log(sum_P); + + // Evaluate whether the entropy is within the tolerance level + double Hdiff = H - log(perplexity); + if (Hdiff < tol && -Hdiff < tol) { + found = true; + } else { + if (Hdiff > 0) { + min_beta = beta; + if (max_beta == DBL_MAX || max_beta == -DBL_MAX) + beta *= 2.0; + else + beta = (beta + max_beta) / 2.0; + } else { + max_beta = beta; + if (min_beta == -DBL_MAX || min_beta == DBL_MAX) + beta /= 2.0; + else + beta = (beta + min_beta) / 2.0; + } + } - // Compute the squared Euclidean distance matrix - double* DD = (double*) malloc(N * N * sizeof(double)); - if(DD == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - computeSquaredEuclideanDistance(X, N, D, DD); + // Update iteration counter + iter++; + } - // Compute the Gaussian kernel row by row - int nN = 0; - for(int n = 0; n < N; n++) { - - // Initialize some variables - bool found = false; - double beta = 1.0; - double min_beta = -DBL_MAX; - double max_beta = DBL_MAX; - double tol = 1e-5; - double sum_P; - - // Iterate until we found a good perplexity - int iter = 0; - while(!found && iter < 200) { - - // Compute Gaussian kernel row - for(int m = 0; m < N; m++) P[nN + m] = exp(-beta * DD[nN + m]); - P[nN + n] = DBL_MIN; - - // Compute entropy of current row - sum_P = DBL_MIN; - for(int m = 0; m < N; m++) sum_P += P[nN + m]; - double H = 0.0; - for(int m = 0; m < N; m++) H += beta * (DD[nN + m] * P[nN + m]); - H = (H / sum_P) + log(sum_P); - - // Evaluate whether the entropy is within the tolerance level - double Hdiff = H - log(perplexity); - if(Hdiff < tol && -Hdiff < tol) { - found = true; - } - else { - if(Hdiff > 0) { - min_beta = beta; - if(max_beta == DBL_MAX || max_beta == -DBL_MAX) - beta *= 2.0; - else - beta = (beta + max_beta) / 2.0; - } - else { - max_beta = beta; - if(min_beta == -DBL_MAX || min_beta == DBL_MAX) - beta /= 2.0; - else - beta = (beta + min_beta) / 2.0; - } - } - - // Update iteration counter - iter++; - } - - // Row normalize P - for(int m = 0; m < N; m++) P[nN + m] /= sum_P; - nN += N; - } - - // Clean up memory - free(DD); DD = NULL; + // Row normalize P + for (int m = 0; m < N; m++) + P[nN + m] /= sum_P; + nN += N; + } } - -// Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free) -void TSNE::computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K) { - - if(perplexity > K) fprintf(stderr,"Perplexity should be lower than K!\n"); - - // Allocate the memory we need - *_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int)); - *_col_P = (unsigned int*) calloc(N * K, sizeof(unsigned int)); - *_val_P = (double*) calloc(N * K, sizeof(double)); - if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - unsigned int* row_P = *_row_P; - unsigned int* col_P = *_col_P; - double* val_P = *_val_P; - double* cur_P = (double*) malloc((N - 1) * sizeof(double)); - if(cur_P == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - row_P[0] = 0; - for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + (unsigned int) K; - - // Build ball tree on data set - VpTree* tree = new VpTree(); - vector obj_X(N, DataPoint(D, -1, X)); - for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D); - tree->create(obj_X); - - // Loop over all points to find nearest neighbors - fprintf(stderr,"Building tree...\n"); - vector indices; - vector distances; - for(int n = 0; n < N; n++) { - - if(n % 10000 == 0) fprintf(stderr," - point %d of %d\n", n, N); - - // Find nearest neighbors - indices.clear(); - distances.clear(); - tree->search(obj_X[n], K + 1, &indices, &distances); - - // Initialize some variables for binary search - bool found = false; - double beta = 1.0; - double min_beta = -DBL_MAX; - double max_beta = DBL_MAX; - double tol = 1e-5; - - // Iterate until we found a good perplexity - int iter = 0; double sum_P; - while(!found && iter < 200) { - - // Compute Gaussian kernel row - for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]); - - // Compute entropy of current row - sum_P = DBL_MIN; - for(int m = 0; m < K; m++) sum_P += cur_P[m]; - double H = .0; - for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]); - H = (H / sum_P) + log(sum_P); - - // Evaluate whether the entropy is within the tolerance level - double Hdiff = H - log(perplexity); - if(Hdiff < tol && -Hdiff < tol) { - found = true; - } - else { - if(Hdiff > 0) { - min_beta = beta; - if(max_beta == DBL_MAX || max_beta == -DBL_MAX) - beta *= 2.0; - else - beta = (beta + max_beta) / 2.0; - } - else { - max_beta = beta; - if(min_beta == -DBL_MAX || min_beta == DBL_MAX) - beta /= 2.0; - else - beta = (beta + min_beta) / 2.0; - } - } - - // Update iteration counter - iter++; +// Compute input similarities with a fixed perplexity using ball trees. +void computeGaussianPerplexity(double* X, int N, int D, + vector* _row_P, + vector* _col_P, + vector* _val_P, double perplexity, + int K) { + if (perplexity > K) + fprintf(stderr, "Perplexity should be lower than K!\n"); + + // Allocate the memory we need + _row_P->resize(N + 1); + _col_P->resize(N * K); + _val_P->resize(N * K); + vector& row_P = *_row_P; + vector& col_P = *_col_P; + vector& val_P = *_val_P; + vector cur_P(N - 1); + row_P[0] = 0; + for (int n = 0; n < N; n++) + row_P[n + 1] = row_P[n] + (unsigned int)K; + + // Build ball tree on data set + VpTree tree((euclidean_distance(D, X))); + vector obj_X(N); + for (int n = 0; n < N; n++) + obj_X[n] = n; + tree.create(obj_X); + + // Loop over all points to find nearest neighbors + fprintf(stderr, "Building tree...\n"); + vector indices; + vector distances; + indices.reserve(N); + distances.reserve(N); + for (int n = 0; n < N; n++) { + if (n % 10000 == 0) + fprintf(stderr, " - point %d of %d\n", n, N); + + // Find nearest neighbors + indices.clear(); + distances.clear(); + tree.search(obj_X[n], K + 1, &indices, &distances); + + // Initialize some variables for binary search + bool found = false; + double beta = 1.0; + double min_beta = -DBL_MAX; + double max_beta = DBL_MAX; + double tol = 1e-5; + + // Iterate until we found a good perplexity + int iter = 0; + double sum_P; + while (!found && iter < 200) { + // Compute Gaussian kernel row + for (int m = 0; m < K; m++) + cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]); + + // Compute entropy of current row + sum_P = DBL_MIN; + for (int m = 0; m < K; m++) + sum_P += cur_P[m]; + double H = .0; + for (int m = 0; m < K; m++) + H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]); + H = (H / sum_P) + log(sum_P); + + // Evaluate whether the entropy is within the tolerance level + double Hdiff = H - log(perplexity); + if (Hdiff < tol && -Hdiff < tol) { + found = true; + } else { + if (Hdiff > 0) { + min_beta = beta; + if (max_beta == DBL_MAX || max_beta == -DBL_MAX) + beta *= 2.0; + else + beta = (beta + max_beta) / 2.0; + } else { + max_beta = beta; + if (min_beta == -DBL_MAX || min_beta == DBL_MAX) + beta /= 2.0; + else + beta = (beta + min_beta) / 2.0; } + } - // Row-normalize current row of P and store in matrix - for(unsigned int m = 0; m < K; m++) cur_P[m] /= sum_P; - for(unsigned int m = 0; m < K; m++) { - col_P[row_P[n] + m] = (unsigned int) indices[m + 1].index(); - val_P[row_P[n] + m] = cur_P[m]; - } + // Update iteration counter + iter++; } - // Clean up memory - obj_X.clear(); - free(cur_P); - delete tree; + // Row-normalize current row of P and store in matrix + for (unsigned int m = 0; m < K; m++) + cur_P[m] /= sum_P; + for (unsigned int m = 0; m < K; m++) { + col_P[row_P[n] + m] = indices[m + 1]; + val_P[row_P[n] + m] = cur_P[m]; + } + } } - // Symmetrizes a sparse matrix -void TSNE::symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, int N) { - - // Get sparse matrix - unsigned int* row_P = *_row_P; - unsigned int* col_P = *_col_P; - double* val_P = *_val_P; - - // Count number of elements and row counts of symmetric matrix - int* row_counts = (int*) calloc(N, sizeof(int)); - if(row_counts == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - for(int n = 0; n < N; n++) { - for(int i = row_P[n]; i < row_P[n + 1]; i++) { - - // Check whether element (col_P[i], n) is present - bool present = false; - for(int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { - if(col_P[m] == n) present = true; - } - if(present) row_counts[n]++; - else { - row_counts[n]++; - row_counts[col_P[i]]++; - } - } +void symmetrizeMatrix(vector* _row_P, + vector* _col_P, vector* _val_P, + int N) { + // Get sparse matrix + vector& row_P = *_row_P; + vector& col_P = *_col_P; + vector& val_P = *_val_P; + + // Count number of elements and row counts of symmetric matrix + vector row_counts(N); + for (int n = 0; n < N; n++) { + for (int i = row_P[n]; i < row_P[n + 1]; i++) { + // Check whether element (col_P[i], n) is present + bool present = false; + for (int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { + if (col_P[m] == n) + present = true; + } + if (present) + row_counts[n]++; + else { + row_counts[n]++; + row_counts[col_P[i]]++; + } } - int no_elem = 0; - for(int n = 0; n < N; n++) no_elem += row_counts[n]; - - // Allocate memory for symmetrized matrix - unsigned int* sym_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int)); - unsigned int* sym_col_P = (unsigned int*) malloc(no_elem * sizeof(unsigned int)); - double* sym_val_P = (double*) malloc(no_elem * sizeof(double)); - if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - - // Construct new row indices for symmetric matrix - sym_row_P[0] = 0; - for(int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + (unsigned int) row_counts[n]; - - // Fill the result matrix - int* offset = (int*) calloc(N, sizeof(int)); - if(offset == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - for(int n = 0; n < N; n++) { - for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i]) - - // Check whether element (col_P[i], n) is present - bool present = false; - for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { - if(col_P[m] == n) { - present = true; - if(n <= col_P[i]) { // make sure we do not add elements twice - sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; - sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; - sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m]; - sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m]; - } - } - } - - // If (col_P[i], n) is not present, there is no addition involved - if(!present) { - sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; - sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; - sym_val_P[sym_row_P[n] + offset[n]] = val_P[i]; - sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i]; - } - - // Update offsets - if(!present || (present && n <= col_P[i])) { - offset[n]++; - if(col_P[i] != n) offset[col_P[i]]++; - } + } + int no_elem = 0; + for (int n = 0; n < N; n++) + no_elem += row_counts[n]; + + // Allocate memory for symmetrized matrix + vector sym_row_P(N + 1); + vector sym_col_P(no_elem); + vector sym_val_P(no_elem); + + // Construct new row indices for symmetric matrix + sym_row_P[0] = 0; + for (int n = 0; n < N; n++) + sym_row_P[n + 1] = sym_row_P[n] + (unsigned int)row_counts[n]; + + // Fill the result matrix + vector offset(N); + for (int n = 0; n < N; n++) { + for (unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { + // considering element(n, col_P[i]) + + // Check whether element (col_P[i], n) is present + bool present = false; + for (unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) { + if (col_P[m] == n) { + present = true; + if (n <= col_P[i]) { // make sure we do not add elements twice + sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; + sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; + sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m]; + sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = + val_P[i] + val_P[m]; + } } - } + } - // Divide the result by two - for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0; + // If (col_P[i], n) is not present, there is no addition involved + if (!present) { + sym_col_P[sym_row_P[n] + offset[n]] = col_P[i]; + sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n; + sym_val_P[sym_row_P[n] + offset[n]] = val_P[i]; + sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i]; + } - // Return symmetrized matrices - free(*_row_P); *_row_P = sym_row_P; - free(*_col_P); *_col_P = sym_col_P; - free(*_val_P); *_val_P = sym_val_P; + // Update offsets + if (!present || (present && n <= col_P[i])) { + offset[n]++; + if (col_P[i] != n) + offset[col_P[i]]++; + } + } + } + + // Divide the result by two + for (int i = 0; i < no_elem; i++) + sym_val_P[i] /= 2.0; - // Free up some memery - free(offset); offset = NULL; - free(row_counts); row_counts = NULL; + // Return symmetrized matrices + *_row_P = move(sym_row_P); + *_col_P = move(sym_col_P); + *_val_P = move(sym_val_P); } // Compute squared Euclidean distance matrix -void TSNE::computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) { - const double* XnD = X; - for(int n = 0; n < N; ++n, XnD += D) { - const double* XmD = XnD + D; - double* curr_elem = &DD[n*N + n]; - *curr_elem = 0.0; - double* curr_elem_sym = curr_elem + N; - for(int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) { - *(++curr_elem) = 0.0; - for(int d = 0; d < D; ++d) { - *curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]); - } - *curr_elem_sym = *curr_elem; - } +vector computeSquaredEuclideanDistance(double* X, int N, int D) { + vector DD(N * N); + const double* XnD = X; + for (int n = 0; n < N; ++n, XnD += D) { + const double* XmD = XnD + D; + double* curr_elem = &DD[n * N + n]; + *curr_elem = 0.0; + double* curr_elem_sym = curr_elem + N; + for (int m = n + 1; m < N; ++m, XmD += D, curr_elem_sym += N) { + double total = 0.0; + for (int d = 0; d < D; ++d) { + double diff = XnD[d] - XmD[d]; + total += diff * diff; + } + *(++curr_elem) = total; + *curr_elem_sym = *curr_elem; } + } + return DD; } - // Makes data zero-mean -void TSNE::zeroMean(double* X, int N, int D) { - - // Compute data mean - double* mean = (double*) calloc(D, sizeof(double)); - if(mean == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - int nD = 0; - for(int n = 0; n < N; n++) { - for(int d = 0; d < D; d++) { - mean[d] += X[nD + d]; - } - nD += D; - } - for(int d = 0; d < D; d++) { - mean[d] /= (double) N; - } - - // Subtract data mean - nD = 0; - for(int n = 0; n < N; n++) { - for(int d = 0; d < D; d++) { - X[nD + d] -= mean[d]; - } - nD += D; - } - free(mean); mean = NULL; -} - - -// Generates a Gaussian random number -double TSNE::randn() { - double x, y, radius; - do { - x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; - y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1; - radius = (x * x) + (y * y); - } while((radius >= 1.0) || (radius == 0.0)); - radius = sqrt(-2 * log(radius) / radius); - x *= radius; - y *= radius; - return x; +template +void zeroMean(double* X, int N) { + // Compute data mean + array mean; + mean.fill(0); + int nD = 0; + for (int n = 0; n < N; n++) { + for (int d = 0; d < D; d++) { + mean[d] += X[nD + d]; + } + nD += D; + } + for (int d = 0; d < D; d++) { + mean[d] /= (double)N; + } + + // Subtract data mean + nD = 0; + for (int n = 0; n < N; n++) { + for (int d = 0; d < D; d++) { + X[nD + d] -= mean[d]; + } + nD += D; + } } -// Function that loads data from a t-SNE file -// Note: this function does a malloc that should be freed elsewhere -bool TSNE::load_data(const char* dat_file, double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter) { - - // Open file, read first 2 integers, allocate memory, and read the data - FILE *h; - if((h = fopen(dat_file, "r+b")) == NULL) { - fprintf(stderr,"Error: could not open data file.\n"); - return false; - } - fread(n, sizeof(int), 1, h); // number of datapoints - fread(d, sizeof(int), 1, h); // original dimensionality - fread(theta, sizeof(double), 1, h); // gradient accuracy - fread(perplexity, sizeof(double), 1, h); // perplexity - fread(no_dims, sizeof(int), 1, h); // output dimensionality - fread(max_iter, sizeof(int),1,h); // maximum number of iterations - *data = (double*) malloc(*d * *n * sizeof(double)); - if(*data == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - fread(*data, sizeof(double), *n * *d, h); // the data - if(!feof(h)) fread(rand_seed, sizeof(int), 1, h); // random seed - fclose(h); - fprintf(stderr,"Read the %i x %i data matrix successfully!\n", *n, *d); - return true; +// Makes data zero-mean +void zeroMean(double* X, int N, int D) { + // Compute data mean + vector mean(D); + int nD = 0; + for (int n = 0; n < N; n++) { + for (int d = 0; d < D; d++) { + mean[d] += X[nD + d]; + } + nD += D; + } + for (int d = 0; d < D; d++) { + mean[d] /= (double)N; + } + + // Subtract data mean + nD = 0; + for (int n = 0; n < N; n++) { + for (int d = 0; d < D; d++) { + X[nD + d] -= mean[d]; + } + nD += D; + } } -// Function that saves map to a t-SNE file -void TSNE::save_data(const char* res_file, double* data, int* landmarks, double* costs, int n, int d) { - - // Open file, write first 2 integers and then the data - FILE *h; - if((h = fopen(res_file, "w+b")) == NULL) { - fprintf(stderr,"Error: could not open data file.\n"); - return; - } - fwrite(&n, sizeof(int), 1, h); - fwrite(&d, sizeof(int), 1, h); - fwrite(data, sizeof(double), n * d, h); - fwrite(landmarks, sizeof(int), n, h); - fwrite(costs, sizeof(double), n, h); - fclose(h); - fprintf(stderr,"Wrote the %i x %i data matrix successfully!\n", n, d); +// Generates a Gaussian random number +double randn() { + double x, y, radius; + do { + x = 2 * (rand() / ((double)RAND_MAX + 1)) - 1; + y = 2 * (rand() / ((double)RAND_MAX + 1)) - 1; + radius = (x * x) + (y * y); + } while ((radius >= 1.0) || (radius == 0.0)); + radius = sqrt(-2 * log(radius) / radius); + x *= radius; + y *= radius; + return x; } -void TSNE::save_csv(const char* csv_file, double* Y, int N, int D) { - std::ofstream csv(csv_file); - - for (int d = 0; d < D; d++) { - csv << "TSNE" << d+1 << ","; +// Perform t-SNE +template +void run(double* X, int N, int D, double* Y, double perplexity, double theta, + int rand_seed, bool skip_random_init, double* init, bool use_init, + int max_iter, int stop_lying_iter, int mom_switch_iter) { + // Set random seed + if (skip_random_init != true) { + if (rand_seed >= 0) { + fprintf(stderr, "Using random seed: %d\n", rand_seed); + srand((unsigned int)rand_seed); + } else { + fprintf(stderr, "Using current time as random seed...\n"); + srand(time(nullptr)); } - csv << "\n"; - + } + + // Determine whether we are using an exact algorithm + if (N - 1 < 3 * perplexity) { + fprintf(stderr, "Perplexity too large for the number of data points!\n"); + exit(1); + } + fprintf(stderr, "Using no_dims = %d, perplexity = %f, and theta = %f\n", + NDIMS, perplexity, theta); + bool exact = (theta == .0) ? true : false; + + fprintf(stderr, + "Using max_iter = %d, stop_lying_iter = %d, mom_switch_iter = %d\n", + max_iter, stop_lying_iter, mom_switch_iter); + + // Set learning parameters + float total_time = .0; + clock_t start, end; + double momentum = .5, final_momentum = .8; + double eta = 200.0; + + // Allocate some memory + vector dY(N * NDIMS); + vector uY(N * NDIMS); + vector gains(N * NDIMS, 1.0); + + // Normalize input data (to prevent numerical problems) + fprintf(stderr, "Computing input similarities...\n"); + start = clock(); + zeroMean(X, N, D); + double max_X = .0; + for (int i = 0; i < N * D; i++) { + auto ax = abs(X[i]); + if (ax > max_X) + max_X = ax; + } + for (int i = 0; i < N * D; i++) + X[i] /= max_X; + + // Compute input similarities for exact t-SNE + vector P; + vector row_P; + vector col_P; + vector val_P; + if (exact) { + // Compute similarities + fprintf(stderr, "Exact?"); + P.resize(N * N); + computeGaussianPerplexity(X, N, D, P.data(), perplexity); + + // Symmetrize input similarities + fprintf(stderr, "Symmetrizing...\n"); + int nN = 0; for (int n = 0; n < N; n++) { - int row_offset = n * D; - for (int d = 0; d < D; d++) { - csv << Y[row_offset + d] << ","; - } - csv << "\n"; + int mN = nN + N; + for (int m = n + 1; m < N; m++) { + P[nN + m] += P[mN + n]; + P[mN + n] = P[nN + m]; + mN += N; + } + nN += N; + } + double sum_P = .0; + for (int i = 0; i < N * N; i++) + sum_P += P[i]; + for (int i = 0; i < N * N; i++) + P[i] /= sum_P; + } else { + // Compute input similarities for approximate t-SNE + + // Compute asymmetric pairwise input similarities + computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity, + (int)(3 * perplexity)); + + // Symmetrize input similarities + symmetrizeMatrix(&row_P, &col_P, &val_P, N); + double sum_P = .0; + for (int i = 0; i < row_P[N]; i++) + sum_P += val_P[i]; + for (int i = 0; i < row_P[N]; i++) + val_P[i] /= sum_P; + } + end = clock(); + + // Lie about the P-values + if (exact) { + for (int i = 0; i < N * N; i++) + P[i] *= 12.0; + } else { + for (int i = 0; i < row_P[N]; i++) + val_P[i] *= 12.0; + } + + // Initialize solution (randomly or with given coordinates) + if (!use_init && !skip_random_init) { + for (int i = 0; i < N * NDIMS; i++) { + Y[i] = randn() * .0001; + } + } else if (use_init) { + for (int i = 0; i < N * NDIMS; i++) { + Y[i] = init[i]; + } + } + + // Perform main training loop + if (exact) + fprintf(stderr, + "Input similarities computed in %4.2f seconds!\nLearning " + "embedding...\n", + (float)(end - start) / CLOCKS_PER_SEC); + else + fprintf(stderr, + "Input similarities computed in %4.2f seconds (sparsity = " + "%f)!\nLearning embedding...\n", + (float)(end - start) / CLOCKS_PER_SEC, + (double)row_P[N] / ((double)N * (double)N)); + start = clock(); + + for (int iter = 0; iter < max_iter; iter++) { + // Compute (approximate) gradient + if (exact) + computeExactGradient(P, Y, N, dY); + else + computeGradient(row_P, col_P, val_P, Y, N, dY, theta); + + // Update gains + for (int i = 0; i < N * NDIMS; i++) + gains[i] = + (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8); + for (int i = 0; i < N * NDIMS; i++) + if (gains[i] < .01) + gains[i] = .01; + + // Perform gradient update (with momentum and gains) + for (int i = 0; i < N * NDIMS; i++) + uY[i] = momentum * uY[i] - eta * gains[i] * dY[i]; + for (int i = 0; i < N * NDIMS; i++) + Y[i] = Y[i] + uY[i]; + + // Make solution zero-mean + zeroMean(Y, N); + + // Stop lying about the P-values after a while, and switch momentum + if (iter == stop_lying_iter) { + if (exact) { + for (int i = 0; i < N * N; i++) + P[i] /= 12.0; + } else { + for (int i = 0; i < row_P[N]; i++) + val_P[i] /= 12.0; + } + } + if (iter == mom_switch_iter) + momentum = final_momentum; + + // Print out progress + if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) { + end = clock(); + double C = .0; + if (exact) { + C = evaluateError(P, Y, N); + } else { + // doing approximate computation here! + C = evaluateError(row_P, col_P, val_P, Y, N, theta); + } + if (iter == 0) + fprintf(stderr, "Iteration %d: error is %f\n", iter + 1, C); + else { + total_time += (float)(end - start) / CLOCKS_PER_SEC; + fprintf(stderr, + "Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", + iter, C, (float)(end - start) / CLOCKS_PER_SEC); + } + start = clock(); } + } + end = clock(); + total_time += (float)(end - start) / CLOCKS_PER_SEC; - csv.close(); + fprintf(stderr, "Fitting performed in %4.2f seconds.\n", total_time); } -// Function that runs the Barnes-Hut implementation of t-SNE -int main(int argc, char *argv[]) { +} // namespace + +extern "C" { + +void DLL_PUBLIC run(double* X, int N, int D, double* Y, int no_dims, + double perplexity, double theta, int rand_seed, + bool skip_random_init, double* init, bool use_init, + int max_iter, int stop_lying_iter, int mom_switch_iter) { + assert(no_dims == 2 || no_dims == 3); + switch (no_dims) { + case 2: + run<2>(X, N, D, Y, perplexity, theta, rand_seed, skip_random_init, init, + use_init, max_iter, stop_lying_iter, mom_switch_iter); + break; + case 3: + run<3>(X, N, D, Y, perplexity, theta, rand_seed, skip_random_init, init, + use_init, max_iter, stop_lying_iter, mom_switch_iter); + break; + default: + throw "invalid dimension"; + } +} - // load input and output - std::string dat_file = "data.dat"; - std::string res_file = "result.dat"; - if (argc > 1) { - dat_file = argv[1]; - res_file = argv[2]; - } +} // extern "C" - const char *dat_file_c = dat_file.c_str(); - const char *res_file_c = res_file.c_str(); - - // Define some variables - int origN, N, D, no_dims, max_iter, *landmarks; - double perc_landmarks; - double perplexity, theta, *data; - int rand_seed = -1; - TSNE* tsne = new TSNE(); - - // Read the parameters and the dataset - if(tsne->load_data(dat_file_c, &data, &origN, &D, &no_dims, &theta, &perplexity, &rand_seed, &max_iter)) { - - // Make dummy landmarks - N = origN; - int* landmarks = (int*) malloc(N * sizeof(int)); - if(landmarks == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - for(int n = 0; n < N; n++) landmarks[n] = n; - - // Now fire up the SNE implementation - double* Y = (double*) malloc(N * no_dims * sizeof(double)); - double* costs = (double*) calloc(N, sizeof(double)); - if(Y == NULL || costs == NULL) { fprintf(stderr,"Memory allocation failed!\n"); exit(1); } - tsne->run(data, N, D, Y, no_dims, perplexity, theta, rand_seed, false, NULL, false, max_iter); - - // Save the results - tsne->save_data(res_file_c, Y, landmarks, costs, N, no_dims); - - // Clean up the memory - free(data); data = NULL; - free(Y); Y = NULL; - free(costs); costs = NULL; - free(landmarks); landmarks = NULL; - } - delete(tsne); -} +#pragma GCC visibility pop diff --git a/tsne/bh_sne_src/tsne.h b/tsne/bh_sne_src/tsne.h index 93e203e..6ec8e05 100644 --- a/tsne/bh_sne_src/tsne.h +++ b/tsne/bh_sne_src/tsne.h @@ -12,7 +12,8 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the Delft University of Technology. + * This product includes software developed by the Delft University of + * Technology. * 4. Neither the name of the Delft University of Technology nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. @@ -20,45 +21,40 @@ * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ - #ifndef TSNE_H #define TSNE_H +#if defined _WIN32 || defined __CYGWIN__ +#ifdef __GNUC__ +#define DLL_PUBLIC __attribute__((dllexport)) +#else +#define DLL_PUBLIC __declspec(dllexport) +#endif +#else +#if __GNUC__ >= 4 +#define DLL_PUBLIC __attribute__((visibility("default"))) +#else +#define DLL_PUBLIC +#endif +#endif -static inline double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); } - - -class TSNE -{ -public: - void run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, int rand_seed, - bool skip_random_init, double *init, bool use_init, int max_iter=1000, int stop_lying_iter=250, int mom_switch_iter=250 - ); - bool load_data(const char* dat_file, double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter); - void save_data(const char* res_file, double* data, int* landmarks, double* costs, int n, int d); - void symmetrizeMatrix(unsigned int** row_P, unsigned int** col_P, double** val_P, int N); // should be static! - void save_csv(const char* csv_file, double* Y, int N, int D); +extern "C" { -private: - void computeGradient(double* P, unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta); - void computeExactGradient(double* P, double* Y, int N, int D, double* dC); - double evaluateError(double* P, double* Y, int N, int D); - double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta); - void zeroMean(double* X, int N, int D); - void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity); - void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K); - void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD); - double randn(); -}; +void DLL_PUBLIC run(double* X, int N, int D, double* Y, int no_dims, + double perplexity, double theta, int rand_seed, + bool skip_random_init, double* init, bool use_init, + int max_iter = 1000, int stop_lying_iter = 250, + int mom_switch_iter = 250); +} #endif diff --git a/tsne/bh_sne_src/vptree.h b/tsne/bh_sne_src/vptree.h index 9e301d6..c9387ce 100644 --- a/tsne/bh_sne_src/vptree.h +++ b/tsne/bh_sne_src/vptree.h @@ -12,261 +12,234 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the Delft University of Technology. - * 4. Neither the name of the Delft University of Technology nor the names of - * its contributors may be used to endorse or promote products derived from + * This product includes software developed by the Delft University of + * Technology. + * 4. Neither the name of the Delft University of Technology nor the names of + * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ +/* This code was adopted with minor modifications from Steve Hanov's great + * tutorial at http://stevehanov.ca/blog/index.php?id=130 */ -/* This code was adopted with minor modifications from Steve Hanov's great tutorial at http://stevehanov.ca/blog/index.php?id=130 */ +#ifndef VPTREE_H +#define VPTREE_H -#include #include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#pragma GCC visibility push(hidden) -#ifndef VPTREE_H -#define VPTREE_H +class euclidean_distance final { + const int _D; + const double* data; -class DataPoint -{ - int _ind; - -public: - double* _x; - int _D; - DataPoint() { - _D = 1; - _ind = -1; - _x = NULL; - } - DataPoint(int D, int ind, double* x) { - _D = D; - _ind = ind; - _x = (double*) malloc(_D * sizeof(double)); - for(int d = 0; d < _D; d++) _x[d] = x[d]; + public: + explicit euclidean_distance(int D, const double* data) : _D(D), data(data) { + } + euclidean_distance(euclidean_distance&&) = default; + euclidean_distance(const euclidean_distance&) = default; + double operator()(const unsigned int t1, const unsigned int t2) const { + double dd = .0; + const double* x1 = data+t1*_D; + const double* x2 = data+t2*_D; + for (int d = 0; d < _D; d++) { + double diff = (x1[d] - x2[d]); + dd += diff * diff; } - DataPoint(const DataPoint& other) { // this makes a deep copy -- should not free anything - if(this != &other) { - _D = other.dimensionality(); - _ind = other.index(); - _x = (double*) malloc(_D * sizeof(double)); - for(int d = 0; d < _D; d++) _x[d] = other.x(d); - } + return sqrt(dd); + } +}; + +template +class VpTree { + public: + // Default constructor + explicit VpTree(Distance&& distance) : distance(distance){}; + + // Destructor + ~VpTree() = default; + + // Function to create a new VpTree from data + void create(const std::vector& items) { + _items = items; + _root = buildFromPoints(0, items.size()); + } + + // Function that uses the tree to find the k nearest neighbors of target + void search(const T& target, int k, std::vector* results, + std::vector* distances) { + // Use a priority queue to store intermediate results on + std::priority_queue heap; + + // Variable that tracks the distance to the farthest point in our results + _tau = DBL_MAX; + + // Perform the search + search(_root.get(), target, k, heap); + + // Gather final results + results->clear(); + distances->clear(); + while (!heap.empty()) { + results->push_back(_items[heap.top().index]); + distances->push_back(heap.top().dist); + heap.pop(); } - ~DataPoint() { if(_x != NULL) free(_x); } - DataPoint& operator= (const DataPoint& other) { // asignment should free old object - if(this != &other) { - if(_x != NULL) free(_x); - _D = other.dimensionality(); - _ind = other.index(); - _x = (double*) malloc(_D * sizeof(double)); - for(int d = 0; d < _D; d++) _x[d] = other.x(d); - } - return *this; + + // Results are in reverse order + std::reverse(results->begin(), results->end()); + std::reverse(distances->begin(), distances->end()); + } + + private: + const Distance distance; + std::vector _items; + double _tau; + + // Single node of a VP tree (has a point and radius; left children are closer + // to point than the radius) + struct Node { + int index; // index of point in node + double threshold; // radius(?) + std::unique_ptr left; // points closer by than threshold + std::unique_ptr right; // points farther away than threshold + + Node() : index(0), threshold(0.) { } - int index() const { return _ind; } - int dimensionality() const { return _D; } - double x(int d) const { return _x[d]; } -}; -double euclidean_distance(const DataPoint &t1, const DataPoint &t2) { - double dd = .0; - double* x1 = t1._x; - double* x2 = t2._x; - double diff; - for(int d = 0; d < t1._D; d++) { - diff = (x1[d] - x2[d]); - dd += diff * diff; + ~Node() = default; + }; + std::unique_ptr _root; + + // An item on the intermediate result queue + struct HeapItem { + HeapItem(int index, double dist) : index(index), dist(dist) { } - return sqrt(dd); -} - - -template -class VpTree -{ -public: - - // Default constructor - VpTree() : _root(0) {} - - // Destructor - ~VpTree() { - delete _root; + int index; + double dist; + bool operator<(const HeapItem& o) const { + return dist < o.dist; } + }; - // Function to create a new VpTree from data - void create(const std::vector& items) { - delete _root; - _items = items; - _root = buildFromPoints(0, items.size()); + // Function that (recursively) fills the tree + std::unique_ptr buildFromPoints(int lower, int upper) { + if (upper == lower) { // indicates that we're done here! + return nullptr; } - - // Function that uses the tree to find the k nearest neighbors of target - void search(const T& target, int k, std::vector* results, std::vector* distances) - { - - // Use a priority queue to store intermediate results on - std::priority_queue heap; - - // Variable that tracks the distance to the farthest point in our results - _tau = DBL_MAX; - - // Perform the search - search(_root, target, k, heap); - - // Gather final results - results->clear(); distances->clear(); - while(!heap.empty()) { - results->push_back(_items[heap.top().index]); - distances->push_back(heap.top().dist); - heap.pop(); - } - - // Results are in reverse order - std::reverse(results->begin(), results->end()); - std::reverse(distances->begin(), distances->end()); + + // Lower index is center of current node + std::unique_ptr node = std::make_unique(); + node->index = lower; + + if (upper - lower > 1) { // if we did not arrive at leaf yet + + // Choose an arbitrary point and move it to the start + int i = (int)((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; + std::swap(_items[lower], _items[i]); + + // Partition around the median distance + int median = (upper + lower) / 2; + auto& lower_item = _items[lower]; + std::nth_element( + _items.begin() + lower + 1, _items.begin() + median, + _items.begin() + upper, [this, lower_item](auto& x, auto& y) { + return distance(lower_item, x) < distance(lower_item, y); + }); + + // Threshold of the new node will be the distance to the median + node->threshold = distance(lower_item, _items[median]); + + // Recursively build tree + node->index = lower; + node->left = buildFromPoints(lower + 1, median); + node->right = buildFromPoints(median, upper); } - -private: - std::vector _items; - double _tau; - - // Single node of a VP tree (has a point and radius; left children are closer to point than the radius) - struct Node - { - int index; // index of point in node - double threshold; // radius(?) - Node* left; // points closer by than threshold - Node* right; // points farther away than threshold - - Node() : - index(0), threshold(0.), left(0), right(0) {} - - ~Node() { // destructor - delete left; - delete right; - } - }* _root; - - - // An item on the intermediate result queue - struct HeapItem { - HeapItem( int index, double dist) : - index(index), dist(dist) {} - int index; - double dist; - bool operator<(const HeapItem& o) const { - return dist < o.dist; - } - }; - - // Distance comparator for use in std::nth_element - struct DistanceComparator - { - const T& item; - DistanceComparator(const T& item) : item(item) {} - bool operator()(const T& a, const T& b) { - return distance(item, a) < distance(item, b); - } - }; - - // Function that (recursively) fills the tree - Node* buildFromPoints( int lower, int upper ) - { - if (upper == lower) { // indicates that we're done here! - return NULL; - } - - // Lower index is center of current node - Node* node = new Node(); - node->index = lower; - - if (upper - lower > 1) { // if we did not arrive at leaf yet - - // Choose an arbitrary point and move it to the start - int i = (int) ((double)rand() / RAND_MAX * (upper - lower - 1)) + lower; - std::swap(_items[lower], _items[i]); - - // Partition around the median distance - int median = (upper + lower) / 2; - std::nth_element(_items.begin() + lower + 1, - _items.begin() + median, - _items.begin() + upper, - DistanceComparator(_items[lower])); - - // Threshold of the new node will be the distance to the median - node->threshold = distance(_items[lower], _items[median]); - - // Recursively build tree - node->index = lower; - node->left = buildFromPoints(lower + 1, median); - node->right = buildFromPoints(median, upper); - } - - // Return result - return node; + + // Return result + return node; + } + + // Helper function that searches the tree + void search(const Node* node, const T& target, int k, + std::priority_queue& heap) { + if (node == nullptr) + return; // indicates that we're done here + + // Compute distance between target and current node + double dist = distance(_items[node->index], target); + + // If current node within radius tau + if (dist < _tau) { + if (heap.size() == k) { + // remove furthest node from result list (if we already + // have k results) + heap.pop(); + } + // add current node to result list + heap.push(HeapItem(node->index, dist)); + if (heap.size() == k) { + // update value of tau (farthest point in result list) + _tau = heap.top().dist; + } } - - // Helper function that searches the tree - void search(Node* node, const T& target, int k, std::priority_queue& heap) - { - if(node == NULL) return; // indicates that we're done here - - // Compute distance between target and current node - double dist = distance(_items[node->index], target); - - // If current node within radius tau - if(dist < _tau) { - if(heap.size() == k) heap.pop(); // remove furthest node from result list (if we already have k results) - heap.push(HeapItem(node->index, dist)); // add current node to result list - if(heap.size() == k) _tau = heap.top().dist; // update value of tau (farthest point in result list) - } - - // Return if we arrived at a leaf - if(node->left == NULL && node->right == NULL) { - return; - } - - // If the target lies within the radius of ball - if(dist < node->threshold) { - if(dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child first - search(node->left, target, k, heap); - } - - if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child - search(node->right, target, k, heap); - } - - // If the target lies outsize the radius of the ball - } else { - if(dist + _tau >= node->threshold) { // if there can still be neighbors outside the ball, recursively search right child first - search(node->right, target, k, heap); - } - - if (dist - _tau <= node->threshold) { // if there can still be neighbors inside the ball, recursively search left child - search(node->left, target, k, heap); - } - } + + // Return if we arrived at a leaf + if (!node->left && !node->right) { + return; } + + // If the target lies within the radius of ball + if (dist < node->threshold) { + if (dist - _tau <= node->threshold) { + // if there can still be neighbors inside the + // ball, recursively search left child first + search(node->left.get(), target, k, heap); + } + + if (dist + _tau >= node->threshold) { + // if there can still be neighbors outside the + // ball, recursively search right child + search(node->right.get(), target, k, heap); + } + + // If the target lies outsize the radius of the ball + } else { + if (dist + _tau >= node->threshold) { + // if there can still be neighbors outside the + // ball, recursively search right child first + search(node->right.get(), target, k, heap); + } + + if (dist - _tau <= node->threshold) { + // if there can still be neighbors inside the + // ball, recursively search left child + search(node->left.get(), target, k, heap); + } + } + } + + VpTree(const VpTree&) = delete; + VpTree& operator=(const VpTree&) = delete; }; - -#endif + +#pragma GCC visibility pop +#endif // !defined(VPTREE_H) diff --git a/tsne/tests/test_stability.npy b/tsne/tests/test_stability.npy new file mode 100644 index 0000000..25cd4de Binary files /dev/null and b/tsne/tests/test_stability.npy differ diff --git a/tsne/tests/test_stability.py b/tsne/tests/test_stability.py new file mode 100644 index 0000000..b78edc0 --- /dev/null +++ b/tsne/tests/test_stability.py @@ -0,0 +1,20 @@ +def test_seed(): + from tsne import bh_sne + from sklearn.datasets import load_iris + import numpy as np + import os.path + from os import environ + + if 'CI' in environ: + # too many different compiler differences causing numeric instability. + return + + iris = load_iris() + + X = iris.data + y = iris.target + + t1 = bh_sne(X, random_state=np.random.RandomState(0), copy_data=True) + t2 = np.load(os.path.join(os.path.dirname(__file__), "test_stability.npy")) + + assert np.all(t1 == t2)