chainCrawler.py
#!/usr/bin/python
'''
This is a webcrawler for Chain-API. (https://github.com/ResEnv/chain-api)
To make sure it doesn't revisit URIs, it creates a hash table in which it
stores 64-bit hashes of the URIs, computed with Google's fast CityHash64.
The table is indexed by the last several bits of each hash. When a hash
collision occurs, the new hash value simply overwrites the previous one.
Locality of URIs should allow this to work for storing non-colliding hashes
of the most recent URIs.
ex: 'http://test.com' hashes to '0x1234567887654321', and the cache table size
is 2^8, or 256, so we apply an 8 bit mask of 0xff (& 255) to the hash. This
gives us hashtable[0x21] = 0x1234567887654321. Whenever we touch a new URI,
we check the masked slot for the URI's hash to see if it matches the stored
hash. If it does, we skip it. If it doesn't match or nothing is stored, we
crawl the page and overwrite the hash value there. (A short sketch of this
lookup follows this docstring.)
The hash table and hashing algorithm are preallocated and optimized for size
and speed using external C libraries.
TODO: restructure for parallelism:
-SHARED CACHE OF VISITS
-IF CACHE DOESN'T CHANGE FOR A LONG TIME, CLEAR
-MAIN ENTRYPOINT-> SPIN UP SEVERAL CONCURRENT CRAWLERS (set #)
-EACH CRAWLER UPDATES ENTRYPOINT
-EACH CRAWLER DEPTH FIRST SEARCH WITH SOME MAX DEPTH STACK, FILO, IF FINISHED
AND NOT POPPING ENTRYPOINT, GO BACK TO ENTRYPOINT AND START AGAIN.
-IF ALL CHILD SITES VISITED, RANDOMLY PICK ONE.
depth first search with given depth 'memory'
expose queue of resources/links to matching rel namespace and resource type
-get links to external resources, eliminate any in depth memory (where you came from)
-compare against search criteria, push matching to external queue
-randomly select one resource link if any exist, compare against hashes, if not hashed follow
-if hashed, select from remaining links and compare, if not hashed follow. repeat until all exhausted
-if all hashed from current resource, move back up depth memory one resource and repeat
-if we exhaust full depth history and all are hashed, go back to entrypoint and start over
-if we are at the entrypoint and try to go back, clear hash table
-delay between access
'''
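#A minimal sketch (illustration only, not used below) of the masked-hash cache
#lookup described above, assuming a 64-bit hash function such as CityHash64
#and a preallocated table of 2**mask_bits slots ('hash64' and 'table' are
#hypothetical names):
#   mask = (1 << mask_bits) - 1          # e.g. 0xff for an 8-bit table index
#   slot = hash64(uri) & mask            # low bits of the hash pick the slot
#   seen = (table[slot] == hash64(uri))  # hit -> skip URI; miss -> crawl it...
#   table[slot] = hash64(uri)            # ...and overwrite whatever was there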
from crawlerCache import CrawlerCacheWithCollisionHistory
from leakyLIFO import LeakyLIFO
from timeDecaySet import TimeDecaySet
from globalConfig import log
import re
import time
import random
import requests
import threading
import Queue
import zmq
class ChainCrawler(object):
def __init__(self, entry_point='http://learnair.media.mit.edu:8000/', \
cache_table_mask_length=8, track_search_depth=5, \
found_set_persistence=720, crawl_delay=1000, filter_keywords=['previous','next']):
#entry_point = starting URL for the crawl
#track_search_depth = how many steps of the crawl path we save, so we can
# retrace them when we hit a dead end
#found_set_persistence = how long, in minutes, to keep a resource URI in memory
# before it is allowed to be returned as a new resource again. 720 = 12
# hours before the crawler 'forgets' it has seen something and resubmits it
# to the queue to be processed
#crawl_delay = how long, in ms, to wait before accessing/crawling a new resource
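#e.g. (hypothetical values) ChainCrawler('http://host:8000/', crawl_delay=500,
# track_search_depth=10) would wait 500 ms between page fetches and remember
# the last 10 crawl steps for backtracking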
self.entry_point = entry_point #entry point URI
#initialize crawl variables
self.current_uri = entry_point #keep track of current location
self.current_uri_type = 'entry_point'
self.current_uri_title = 'entry_point'
self.crawl_history = LeakyLIFO(track_search_depth) #keep track of the recent crawl path
self.crawl_delay = crawl_delay #in milliseconds
self.found_resources = TimeDecaySet(found_set_persistence) #in seconds
#initialize cache
self.cache = CrawlerCacheWithCollisionHistory(cache_table_mask_length)
#initialize queue/zmq variables
self.q = None
self.zmq = None
self.find_called = False
#initialize filter word list for crawling
self.filter_keywords = ['edit','create','self','curies','websocket']
self.filter_keywords.extend(filter_keywords)
log.debug( "filter keywords %s", self.filter_keywords)
log.info( "-----------------------------------------------" )
log.info( "Crawler Initialized." )
log.info( "Entry Point: %s", self.entry_point )
log.info( "-----------------------------------------------" )
@staticmethod
def apply_hal_curies(json, del_curies=True):
'''Find and apply CURIES relationship shortcuts (namespace/rel
definitions) to the other links in the json object. E.g., if we have
a CURIES "http://learnair.media.mit.edu/rels/{rel}" with name "ch",
and a link relation 'ch:sites', remove the CURIES prefix and expand
the relation so that 'ch:sites' becomes
"http://learnair.media.mit.edu/rels/sites". del_curies tells this
function whether to remove the CURIES section of _links after applying
it to the document (True), or to leave it in (False).'''
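#Illustrative (hypothetical) before/after, per the docstring above:
#   _links = {'curies': [{'name': 'ch',
#                         'href': 'http://learnair.media.mit.edu/rels/{rel}'}],
#             'ch:sites': {'href': '/sites/'}}
#   becomes
#   _links = {'http://learnair.media.mit.edu/rels/sites': {'href': '/sites/'}}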
try:
curies = json['_links']['curies'] #find the curies.
for curie in curies: #compare each curies name...
for key in json['_links'].keys(): #...with each link relationship (copy keys so we can modify _links below)
#if we find a link relation that uses the curies
if (key.startswith(curie['name'] + ':')):
#combine the curies & key to make the full resource link
newIndex = curie['href']
replaceString = key.split(curie['name'] + ':',1)[1]
newIndex = re.sub(r"\{.*\}", replaceString, newIndex)
#move the resource to the full resource link
json['_links'][newIndex] = json['_links'][key]
del json['_links'][key]
log.debug( 'CURIES: %s moved to %s', key, newIndex )
#delete curies section of json if desired
if del_curies:
del json['_links']['curies']
log.debug( 'CURIES: CURIES Resource applied fully & removed.' )
except KeyError: #no 'curies' section present
log.warn( "CURIES: No CURIES found" )
return json
@staticmethod
def pluralize_resource_name(resource_name, namespace=""):
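#naive pluralization: 'sensor' -> ['sensors', 'sensores']; for irregular
#nouns the caller should pass an explicit plural instead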
return [namespace + resource_name + 's', namespace + resource_name + 'es']
def flatten_filter_link_array(self, req_links):
'''Takes the '_links' JSON object (after CURIES have been applied, if
desired) and handles HAL 'items' collections and other links by
flattening them into a single list. Each list element has the fields
'href' (the actual crawlable link), 'type' (the link relation, associated
with the type of resource at the other end of the link), 'from_item_list'
(true if the resource was part of an 'items' collection), and 'title'
(a unique name for the resource at the other end of the link).
'from_item_list' is required because collection items inherit the type
from the link above them, which is likely plural, even though they
themselves are singular. There is no generalizable way to go from a
plural resource name to a singular one. As such, 'from_item_list' tells
us to accept the pluralized version of the type as indicative of the
found resource.
'''
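#e.g. (hypothetical) one returned element might look like:
#   {'href': 'http://learnair.media.mit.edu:8000/sites/1',
#    'type': 'http://learnair.media.mit.edu:8000/rels/sites',
#    'title': 'Some Site', 'from_item_list': True}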
crawl_links=[]
#formulate and push link items to crawl_links array from json
for key, item in req_links.iteritems():
#first handle 'item' links
if key == 'items':
for items_item in item:
#inherit 'type' from previous crawl step
try:
items_item['type'] = self.current_uri_type
except:
log.error('Cannot inherit type information of list from previous crawl')
items_item['type'] = 'UNKNOWN'
items_item['from_item_list'] = True
crawl_links.append(items_item)
#now filter out links we don't want and push the rest
elif not any(substring in key.lower() for substring in \
self.filter_keywords):
if item is not None:
item['type']=key
item['from_item_list'] = False
crawl_links.append(item)
else:
log.warn(' EXTRACT_LINK: nonetype link detected in' + \
' resource %s', key)
return crawl_links
def get_external_links(self, req_links):
#call 'real' function, which (1) flattens 'items', (2) filters out
#create/edit forms, websockets, curies, and self, and (3) formats
#things nicely for us in an array:
crawl_links = self.flatten_filter_link_array(req_links)
#we now have a well-structured list of links with known types
#before returning, delete any list items that are in our crawl history
crawl_links = [x for x in crawl_links if x['href'] not in [y['href'] for y in self.crawl_history.asList()]] #compare hrefs, not whole link dicts
#for our final list, append info on whether links are in cache
for link in crawl_links:
link['in_cache'] = self.cache.check(link['href'])
return crawl_links
def query_link_array(self, crawl_links):
'''takes a crawl_link array (which has links and types of objects)
and decides which of these links match what was queried for. Returns a
list of URIs for the matching resources.'''
if self.qry_resource_type is not None:
log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
if self.qry_resource_title is not None:
log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)
matching_uris = []
#(1) if resource name exists, filter items to get only items that
#match the singular resource name, AND (things that match the plural
#resource name && are from_item_list)
#(2) if title exists, filter items remaining for those that match the title
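#e.g. (hypothetical): querying for resource_type 'sensor' matches a link
#whose type is the namespaced 'sensor' rel, or a link whose type is the
#namespaced 'sensors' rel and that came from an 'items' list; adding
#resource_title 'test004' further requires the link's title to be 'test004'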
for link_item in crawl_links:
log.debug('SEARCH_LIST: checking if %s matches query criteria', link_item['href'])
this_link_item_matches = True
#see if it matches resource_type, if queried for
if self.qry_resource_type is not None:
if ((any(link_item['type'].lower() in x for x in self.qry_resource_plural) and link_item['from_item_list']) \
or (link_item['type'].lower() == self.qry_resource_type)):
#it does!
log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
else:
#it doesn't, but we're searching on resource_type
this_link_item_matches = False
#see if it matches resource_title, if queried for
if self.qry_resource_title is not None:
if (link_item['title'].lower() == self.qry_resource_title):
#it does!
log.info('SEARCH_LIST: matched search_title %s', link_item['title'])
else:
#it doesn't, but we're searching on resource_title
this_link_item_matches = False
#if we made it to here and this_link_item_matches, it's a match!
if this_link_item_matches:
matching_uris.append(link_item['href'])
#return list of matching uris
return matching_uris
def query_current_node(self, json):
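'''Like query_link_array, but checks whether the resource we just
downloaded (the current node) matches the query criteria. Unlike the
link check, this can also test qry_extra key/value pairs against the
resource's own JSON fields; e.g. (hypothetically)
resource_extra={'sensor_type': 'AlphasenseO3-A4'} matches only if
json['sensor_type'] == 'AlphasenseO3-A4'.'''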
matching_uris = []
if self.qry_resource_type is not None:
log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
if self.qry_resource_title is not None:
log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)
if self.qry_extra is not None:
log.info('SEARCH_LIST: looking for %s', self.qry_extra)
this_link_item_matches = True
if self.qry_resource_type is not None:
if (any(self.current_uri_type.lower() in x for x in self.qry_resource_plural) \
or self.current_uri_type.lower() == self.qry_resource_type):
#it does!
log.info('SEARCH_LIST: matched search_type %s', self.current_uri_type)
else:
#it doesn't, but we're searching on resource_type
this_link_item_matches = False
#see if it matches resource_title, if queried for
if self.qry_resource_title is not None:
if (self.current_uri_title.lower() == self.qry_resource_title):
#it does!
log.info('SEARCH_LIST: matched search_title %s', self.current_uri_title)
else:
#it doesn't, but we're searching on resource_title
this_link_item_matches = False
if self.qry_extra is not None:
for key, val in self.qry_extra.iteritems():
try:
actual_val = json[key]
if actual_val == val:
log.info('SEARCH_LIST: matched search_extra %s: %s', key, val)
else:
this_link_item_matches = False
except:
this_link_item_matches = False
#if we made it to here and this_link_item_matches, it's a match!
if this_link_item_matches:
matching_uris.append(self.current_uri)
#return list of matching uris
return matching_uris
def push_uris_to_queue(self, uris):
'''check uris against found_resources set, and if they're not there,
get resource and push URI and resource out to queue'''
found_one = False
for uri in uris:
#if 'add' returns true, it's not in our set yet
if self.found_resources.add(uri):
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('New Resource Found! %s', uri)
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
found_one = True
#push uri and resource to queue!
if isinstance(self.q, Queue.Queue):
log.info('QUEUE: Pushing to queue')
self.q.put(uri)
elif self.zmq is not None:
log.info('QUEUE: Pushing to ZMQ socket')
self.zmq.send_string(uri)
else:
log.warn('QUEUE: Queue and ZMQ Socket undefined')
return found_one
def crawl_thread(self, q=None, namespace="", resource_type=None, \
plural_resource_type=None, resource_title=None, resource_extra=None):
'''
q is the Queue.Queue instance that URIs of found resources will be pushed
to. The crawl runs in a daemon thread.
'''
if q is not None:
self.q = q
kwargs = {}
kwargs['namespace'] = namespace
if resource_type is not None:
kwargs['resource_type'] = resource_type
if plural_resource_type is not None:
kwargs['plural_resource_type'] = plural_resource_type
if resource_title is not None:
kwargs['resource_title'] = resource_title
if resource_extra is not None:
kwargs['resource_extra'] = resource_extra
self.thread = threading.Thread(target=self.crawl, kwargs=kwargs)
self.thread.daemon = True
self.thread.start()
def crawl_zmq(self, socket="tcp://127.0.0.1:5557", namespace="", resource_type=None, \
plural_resource_type=None, resource_title=None, resource_extra=None):
'''
socket is the ZMQ endpoint address; a PUSH socket is bound to it, and URIs
of found resources are sent over it.
'''
context = zmq.Context()
self.zmq = context.socket(zmq.PUSH)
self.zmq.bind(socket)
self.crawl(namespace,resource_type,plural_resource_type,resource_title, resource_extra)
def crawl(self, namespace="", resource_type=None, \
plural_resource_type=None, resource_title=None, resource_extra=None):
'''
Crawl through chain, pushing the URI of each resource that matches the
passed criteria onto the queue. If no criteria are passed, push all
resources.
Matching can be done on resource_type. If you want a resource list
(plural, i.e. a list of organization resources, NOT a single organization
resource), you can specify the plural as the resource_type.
The code assumes the word can be pluralized by adding an 's' or 'es' to
the end. If this is not true (e.g. person -> people), please give the
plural so the code can recognize when it has found a list of the
singular resource of interest.
If looking for a specific resource, resource_title is cross-checked
against the title of the resource. It is ANDed with the other query
criteria.
'''
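#e.g. (hypothetical): crawl(namespace='http://learnair.media.mit.edu:8000/rels/',
#   resource_type='person', plural_resource_type='people') would match
#   resources whose type is the namespaced 'person' rel, and item lists whose
#   type is the namespaced 'people' (or auto-generated 'persons') rel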
#store search criteria in lowercase form, with namespace appended
#add plural forms +'s', +'es' to list of plural cases to look for
if resource_type is not None:
#append namespace
self.qry_resource_type = namespace + resource_type
#make all lowercase
self.qry_resource_type = self.qry_resource_type.lower()
#'pluralize' resource after adding namespace
self.qry_resource_plural = self.pluralize_resource_name(self.qry_resource_type)
#add special pluralization if given by user
if plural_resource_type is not None:
self.qry_resource_plural.append(namespace + plural_resource_type)
#make all plural list items lowercase
self.qry_resource_plural = [x.lower() for x in self.qry_resource_plural]
else:
#not searching on resource_type, just define qry_resource_type as None
self.qry_resource_type = None
if resource_title is not None:
#make all lowercase
self.qry_resource_title = resource_title.lower()
else:
#not searching on title, just define qry_resource_title as None
self.qry_resource_title = None
if resource_extra is not None:
self.qry_extra = resource_extra
else:
self.qry_extra = None
#end initializing query variables
loop_count=0
#keep calling crawl_node, unless it returns false, with a pause between
while(self.crawl_node()):
#delay for crawl_delay ms between calls
time.sleep(self.crawl_delay/1000.0)
#count loop iterations
loop_count = loop_count + 1
log.info( "MAIN CRAWL LOOP ITERATION %s -----------------", loop_count )
log.info( "--- crawling ended, %s pages crawled ---", loop_count )
return self.found_resources
def crawl_node(self):
#put uri in cache now that we're crawling it, make a note of collisions
if self.cache.put_and_collision(self.current_uri):
log.info( 'HASH COLLISION: value overwritten in hash table.' )
#debug: print state of cache after updating
log.debug('CACHE STATE: %s', self.cache._cache)
#download the current resource
try:
req = requests.get(self.current_uri)
log.info( '%s downloaded.', self.current_uri )
#downloading the current resource failed
except requests.exceptions.ConnectionError:
log.warn( 'URI "%s" unresponsive, moving back to previous link...',\
self.current_uri )
#if we failed to download the entry point, give up
if self.current_uri == self.entry_point:
log.error( 'URI is entry point, no previous link. Try again when' \
+ ' the entry point URI is available.' )
return False
#if it wasn't the entry point, go back in our search history
try:
prev = self.crawl_history.pop()
self.current_uri = prev['href']
self.current_uri_type = prev['type']
self.current_uri_title = prev['title']
return True
#if we don't have any history left, go back to the entry point
except:
log.info( 'exhausted depth of search history, back to entry point' )
self.current_uri = self.entry_point
self.current_uri_type = "entry_point"
self.current_uri_title = "entry_point"
return True
#end downloading resource
#put request in JSON form, apply CURIES, get links
resource_json = req.json()
log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
req_links = self.apply_hal_curies(resource_json)['_links']
crawl_links = self.get_external_links(req_links)
#crawl_links is a 'flat' list of link dicts; each element has the
#fields href, type, title, in_cache, from_item_list
log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, ' + \
'self, create/edit, ws, itemlist flattened): %s', crawl_links)
#find the uris/resources that match search criteria!
if self.qry_extra is None:
#we don't need to actually download the link to see if it matches
matching_uris = self.query_link_array(crawl_links)
else:
#we only have enough information to tell if the current node matches
matching_uris = self.query_current_node(resource_json)
#... and send them out!!
if (self.push_uris_to_queue(matching_uris) and self.find_called):
return False #end crawl if we found one and 'find' was called
#select next link!!!!
#get uncached links
uncached_links = [x for x in crawl_links if not x['in_cache']]
log.info('CRAWL: %s LINKS UNCACHED OF %s LINKS FOUND', \
len(uncached_links), len(crawl_links) )
if (len(uncached_links)>0):
#we have uncached link(s) to follow! randomly pick one.
random_index = random.randrange(0,len(uncached_links))
self.crawl_history.push({'href':self.current_uri, 'type':self.current_uri_type, 'title':self.current_uri_title})
self.current_uri = uncached_links[random_index]['href']
self.current_uri_type = uncached_links[random_index]['type']
self.current_uri_title = uncached_links[random_index]['title']
else:
#we don't have any uncached options from this node. Damn.
log.info('CRAWL: no new links available here, crawling back up history')
#special case of being at the entry point
if (self.current_uri_type == 'entry_point'):
#double check we have something to crawl
if (len(crawl_links) > 0):
log.info('CRAWL: no uncached links from entrypoint, resetting cache')
self.cache.clear() # clear cache
#randomly select node from crawl_links
random_index = random.randrange(0,len(crawl_links))
self.crawl_history.push({'href':self.current_uri, 'type':self.current_uri_type, 'title':self.current_uri_title})
self.current_uri = crawl_links[random_index]['href']
self.current_uri_type = crawl_links[random_index]['type']
self.current_uri_title = crawl_links[random_index]['title']
else:
log.error('CRAWL: NO CRAWLABLE LINKS DETECTED AT ENTRY_POINT!!!!')
return False
#not at entry point, time to try and move back up in history
try:
prev = self.crawl_history.pop()
self.current_uri = prev['href']
self.current_uri_type = prev['type']
self.current_uri_title = prev['title']
except: #no history left, not at entry point- jump to entry point
log.info('CRAWL: crawling back up history, but exhausted history. Jump to entrypoint.')
self.current_uri = self.entry_point
self.current_uri_type = 'entry_point'
self.current_uri_title = 'entry_point'
log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('CRAWL: crawling to %s : %s', self.current_uri_title.upper(), self.current_uri)
log.info('CRAWL: type: %s', self.current_uri_type)
log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
#recurse
return True
def find(self, namespace="", resource_type=None, \
plural_resource_type=None, resource_title=None, resource_extra=None):
'''Crawls, and when it finds a match, returns it immediately.'''
self.find_called = True
uris= self.crawl(namespace=namespace, resource_type=resource_type, \
plural_resource_type=plural_resource_type, resource_title=resource_title, resource_extra=resource_extra)
if uris.size() >= 1:
return uris.asList()[0]
else:
return None
if __name__=="__main__":
#######JUST CRAWL EXAMPLES######
#crawler = ChainCrawler('http://learnair.media.mit.edu:8000/devices/10')
#crawler = ChainCrawler('http://learnair.media.mit.edu:8000/devices/?site_id=1')
#crawler = ChainCrawler(found_set_persistence=2, crawl_delay=500)
crawler = ChainCrawler()
crawler.crawl(namespace='http://learnair.media.mit.edu:8000/rels/', \
resource_type='sensor', resource_extra={'sensor_type':'AlphasenseO3-A4'})
#crawler.crawl(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_title='a')
#crawler.crawl(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_type='Device', \
# resource_title='test004')
crawler.crawl()
#######THREADING QUEUE EXAMPLES######
#testQueue = Queue.Queue()
#crawler = ChainCrawler(found_set_persistence=2, crawl_delay=500)
#crawler.crawl_thread(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_type='site')
#crawler.crawl_thread(q=testQueue, namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_title='a')
#crawler.crawl_thread(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_type='Device', \
# resource_title='test004')
#crawler.crawl_thread()
#CAUTION: this main loop doesn't end
#while True:
# uri = testQueue.get()
# print uri
#test Daemon exists on main thread exit
#time.sleep(5)
#######ZMQ EXAMPLES######
#crawler = ChainCrawler(found_set_persistence=2, crawl_delay=500)
#crawler.crawl_zmq(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_title='a')
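#a matching consumer (hypothetical sketch) would connect a zmq PULL socket
#to the same endpoint and read URIs as the crawler finds them:
#   context = zmq.Context()
#   receiver = context.socket(zmq.PULL)
#   receiver.connect("tcp://127.0.0.1:5557")
#   while True:
#       print receiver.recv_string()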
#######FIND EXAMPLE######
'''
crawler = ChainCrawler(found_set_persistence=2, crawl_delay=500)
x=crawler.find(namespace='http://learnair.media.mit.edu:8000/rels/', \
resource_title="Test Deployment #2",resource_type='deployment')
print x
'''