diff --git a/.gitignore b/.gitignore
index 2619693..5797177 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,8 +6,12 @@
 /data/geojson
 /data/tiles
 /data/labdata.key
+/data/macademia_people
 map.xml
+*.geojson
+*.geoJSON
+
 .DS_Store
 /data/.DS_Store
 /install/.DS_Store
 
diff --git a/cartograph/Config.py b/cartograph/Config.py
index 9b7a21b..744d3ed 100644
--- a/cartograph/Config.py
+++ b/cartograph/Config.py
@@ -1,3 +1,4 @@
+
 from ConfigParser import SafeConfigParser
 
 EXTERNAL_FILES = 'ExternalFiles'
diff --git a/knn.py b/knn.py
new file mode 100644
index 0000000..c9d9603
--- /dev/null
+++ b/knn.py
@@ -0,0 +1,62 @@
+import luigi
+import os
+import cartograph
+import numpy as np
+from sklearn.neighbors import KDTree
+from sklearn.metrics.pairwise import cosine_similarity
+from pprint import pprint
+
+from cartograph import Config
+from cartograph import Util
+config = Config.BAD_GET_CONFIG()
+
+
+
+peopleDict = Util.read_features(config.FILE_NAME_NUMBERED_VECS, #config people
+                                config.FILE_NAME_NUMBERED_NAMES,
+                                config.FILE_NAME_ARTICLE_COORDINATES)
+peopleKeys = list(peopleDict.keys())
+peopleVectors = np.array([peopleDict[vID]["vector"] for vID in peopleKeys])
+peopleNames = [peopleDict[nID]["name"] for nID in peopleKeys]
+x = [float(peopleDict[fID]["x"]) for fID in peopleKeys]
+y = [float(peopleDict[fID]["y"]) for fID in peopleKeys]
+
+
+
+interestDict = Util.read_features(config.FILE_NAME_MORE_VECS,
+                                  config.FILE_NAME_MORE_NAMES)
+interestKeys = list(interestDict.keys())
+interestVectors = np.array([interestDict[vID]["vector"] for vID in interestKeys])
+interestNames = [interestDict[nID]["name"] for nID in interestKeys]
+
+
+kdt = KDTree(peopleVectors, leaf_size=30)
+
+
+x_lst = []
+y_lst = []
+#knn_dict = {}
+for i in range(len(interestVectors)):
+    dist, ind = kdt.query([interestVectors[i]], k=5)
+    temp_x_lst=[]
+    temp_y_lst=[]
+    temp_name_lst = []
+    weights = []
+    for j in ind[0]:
+        temp_x_lst.append(x[j])
+        temp_y_lst.append(y[j])
+        temp_name_lst.append(peopleNames[j])
+        weights.append(float(cosine_similarity([interestVectors[i]], [peopleVectors[j]]))) #cosine similarity >0
+    #print weights
+    #weights = [1,1,1,1,1]
+    x_lst.append(np.average(temp_x_lst, axis = 0, weights = weights))
+    y_lst.append(np.average(temp_y_lst, axis = 0, weights = weights))
+    #knn_dict[interestNames[i]] = temp_name_lst
+
+
+Util.write_tsv(config.FILE_NAME_MORE_COORDINATES,
+               ("index", "x", "y"), interestKeys, x_lst, y_lst)
+
+
+#pprint(knn_dict)
+
diff --git a/workflow.py b/workflow.py
index f23d4ca..e7eb191 100644
--- a/workflow.py
+++ b/workflow.py
@@ -190,7 +190,7 @@ def run(self):
             name = featureDict[featureID]["name"]
             popularityList.append(nameDict[name])
 
-        Util.write_tsv(config.FILE_NAME_NUMBERED_POPULARITY,
+        Util.write_tsv(config.get("PreprocessingFiles", "popularity_with_id"),
                        ("id", "popularity"),
                        idList, popularityList)
 
@@ -287,7 +287,7 @@ def run(self):
         X = [float(points[k]['x']) for k in keys]
         Y = [float(points[k]['y']) for k in keys]
         maxVal = max(abs(v) for v in X + Y)
-        scaling = config.MAX_COORDINATE / maxVal
+        scaling = config.get("MapConstants", "max_coordinate") / maxVal
         X = [x * scaling for x in X]
         Y = [y * scaling for y in Y]
         Util.write_tsv(config.get("PreprocessingFiles",
@@ -434,7 +434,6 @@ def run(self):
                       range(1, len(regionList) + 1), regionList)
 
 
-
 class CreateContours(MTimeMixin, luigi.Task):
     '''
     Make contours based on density of points inside the map