-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatascrape.py
More file actions
146 lines (124 loc) · 4.57 KB
/
datascrape.py
File metadata and controls
146 lines (124 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 2 05:46:30 2016
@author: jessica
Module used to download and obtain image links from bing
Also includes methods to download from Krumbs
"""
import py_bing_search as bing
import urllib
import json
###############################################################################
#DATASCRAPING
###############################################################################
def bing_image_search(query, output_filename, lim):
"""
search bing for lim number of images matching the query and output the
following information into the given output filename
1. title
2. url
3. width
4. height
5. size
6. content type
Parameters
------------
output_filename: str: file to output to
query: str: search term/terms (space delimited)
limi: number of results we want
Returns
------------
int: number of items added to output_filename
"""
if(type(query) != str):
raise TypeError("query must be of type string")
if(type(lim) != int or lim <= 0):
raise ValueError("limit must be positive integer")
searcher = bing.PyBingImageSearch(query)
result = searcher.search_all(limit=lim, format='json')
bing.image_results_to_file(result, output_filename, append=True)
return len(result)
#Note: to do image classification on a list of links, it is best to run the following
#>>> a = _get_image_links
#>>> result = gClassify.get_collection_tags(your_image_link, rel_threshold=0.3, current_dict=your_default_dict)
def _get_image_links(filename, delimiter=";;;", index=1):
"""
pulls imagelinks from given filename that is in some delimited format
Parameters
------------
filename:str: path to txt file
delimeter:str: separating character string used to separate pieces of info in same entry
index: index of delimited strings where link is
Returns
----------
list[str]: list of pulled links
"""
ans = []
with open(filename, 'r') as myFile:
for line in myFile:
temp = line.split(delimiter)
ans.append(temp[index])
return ans
def jsonUrl(url):
'''
Accesses the JSON retuned from a given url
Parameter: url(str)
Returns: list[dict]
'''
response = urllib.urlopen(url)
data = json.loads(response.read())
return data
#http://sln.ics.uci.edu:8085/eventshoplinux/rest/sttwebservice/search/70/box/33.554234134963096,-117.96170501708986/33.782820548089866,-117.67674713134767/1456531200000/1457740800000
#1464739200000/1464825600000
def krumbsJsonFeatures(url):
"""
Accesses the eventshop JSON array from given url and exxtracts the given
features (not in any particular order)
1. id
2. time
3. image_url
4. intent (name, synonym)
5. geo:(lat, lng) #may or may not be there
6. caption: #may or may not be there
Returns
-------------
list[dict{str:str}]
"""
response = urllib.urlopen(url)
data = json.loads(response.read())
toReturn = [None]*len(data)
for i in range(len(data)):
print "we are on index ", i
toReturn[i] = {}
toReturn[i]["id"] = data[i]["stt_id"]
try: #geo coordinates
toReturn[i]["geo"] = tuple(data[i]["stt_where"]["point"])
except:
pass
toReturn[i]["time"] = data[i]["stt_when"]["datetime"]
try: #image url
toReturn[i]["image_url"] = data[i]["stt_what"]["media_source"]["value"]
except:
toReturn[i]["image_url"] = data[i]["stt_what"]["media_source_photo"]["value"]
try: #intent
toReturn[i]["intent"] = (data[i]["stt_what"]["intent_name"]["value"], data[i]["stt_what"]["intent_used_synonym"]["value"])
except:
toReturn[i]["intent"] = (None, data[i]["stt_what"]["intent_used_synonym"]["value"])
try: #caption
toReturn[i]["caption"] = data[i]["stt_what"]["caption"]["value"]
except:
pass
try: #location
toReturn[i]["location"] = data[i]["raw_data"]["media"][0]["where"]["revgeo_places"][0]["name"]
except:
print "AAAAHH locations: ", data[i]["raw_data"]
print "type of location", type(data[i]["raw_data"])
return toReturn
def extract_krumbs_links(k_list):
"""
from list of krumbs json dictionaries, extract just the link and returns a list of links
"""
ans = []
for item in k_list:
ans.append(item["image_url"])
return ans