# Review notes for this patch (free text placed ahead of the first
# "diff --git" header; nothing below this comment block has been altered).
#
# NOTE(review): in this copy the patch's line breaks and indentation appear to
# have been collapsed into single spaces, so hunk bodies run together. The
# original line structure has to be restored before the patch can be applied.
#
# What the patch changes, as visible below:
# - ckanext/dcat/configuration_processors.py
#   * default_groups: every list item must now be a str (previously only the
#     first item was checked).
#   * organizations_filter_include / organizations_filter_exclude: each must
#     be a list whose items are all str, otherwise ValueError is raised.
#   * New FormatFilter: format_filter_include and format_filter_exclude are
#     mutually exclusive, must be lists of str, and are lower-cased in place
#     inside config_obj.
#   * New TagFilter: the same validation for tag_filter_include /
#     tag_filter_exclude, without any case normalisation.
#   * resource_format_order: reworded error message plus a per-item str check.
# - ckanext/dcat/harvesters/_json.py (_get_guids_and_datasets)
#   * The organization filter settings are now read from self.config after
#     the JSON document shape check instead of before it; the new format and
#     tag filter settings are read at the same place.
#   * Datasets are skipped based on lower-cased distribution "format" values
#     and on "keyword" entries, according to the include/exclude lists.
#   * The sha1 fallback for a missing identifier moves ahead of the
#     parse_id_if_url branch.
# - ckanext/dcat/harvesters/base.py: imports FormatFilter and TagFilter and
#   adds them to the list shown in the DCATHarvester hunk.
# - ckanext/dcat/tests/test_configuration_processors.py: adds
#   TestOrganizationFilter, TestFormatFilter and TestTagFilter, which feed
#   list values (expected to pass) and plain strings (expected ValueError).
#
# NOTE(review): with parse_id_if_url set, guid is reassigned from
# utils.parse_identifier(dataset.get('identifier')) after the sha1 fallback
# has already run, so a dataset without an identifier no longer ends on the
# fallback value. parse_identifier is not visible here -- confirm what it
# does when given None.
# NOTE(review): tag matching is case-sensitive while format matching is
# lower-cased on both sides -- confirm this asymmetry is intended.
# NOTE(review): dataset.get('keyword', []) and dataset.get('distribution', [])
# only fall back to [] when the key is absent; presumably an explicit null
# would make the membership test / iteration fail -- verify against real feeds.
diff --git a/ckanext/dcat/configuration_processors.py b/ckanext/dcat/configuration_processors.py index 24818357..10597031 100644 --- a/ckanext/dcat/configuration_processors.py +++ b/ckanext/dcat/configuration_processors.py @@ -123,8 +123,8 @@ def check_config(config_obj): if 'default_groups' in config_obj: if not isinstance(config_obj['default_groups'], list): raise ValueError('default_groups must be a *list* of group names/ids') - if config_obj['default_groups'] and not isinstance(config_obj['default_groups'][0], str): - raise ValueError('default_groups must be a list of group names/ids (i.e. strings)') + if not all(isinstance(item, str) for item in config_obj['default_groups']): + raise ValueError('default_groups must be a *list* of group names/ids (i.e. strings)') # Check if default groups exist context = {'model': model, 'user': p.toolkit.c.user} @@ -471,6 +471,56 @@ def check_config(config_obj): and 'organizations_filter_exclude' in config_obj: raise ValueError('Harvest configuration cannot contain both ' 'organizations_filter_include and organizations_filter_exclude') + for key in ['organizations_filter_include', 'organizations_filter_exclude']: + if key in config_obj: + orgs_list = config_obj[key] + if not isinstance(orgs_list, list): + raise ValueError(f"{key} must be a list of organizations") + if not all(isinstance(item, str) for item in orgs_list): + raise ValueError(f"{key} must be a list of strings") + + @staticmethod + def modify_package_dict(package_dict, config, dcat_dict): + pass + + +class FormatFilter(BaseConfigProcessor): + + @staticmethod + def check_config(config_obj): + if 'format_filter_include' in config_obj \ + and 'format_filter_exclude' in config_obj: + raise ValueError('Harvest configuration cannot contain both ' + 'format_filter_include and format_filter_exclude') + for key in ['format_filter_include', 'format_filter_exclude']: + if key in config_obj: + formats_list = config_obj[key] + if not isinstance(formats_list, list): + raise
ValueError(f"{key} must be a list of formats") + if not all(isinstance(item, str) for item in formats_list): + raise ValueError(f"{key} must be a list of strings") + config_obj[key] = [fmt.lower() for fmt in formats_list] + + @staticmethod + def modify_package_dict(package_dict, config, dcat_dict): + pass + + +class TagFilter(BaseConfigProcessor): + + @staticmethod + def check_config(config_obj): + if 'tag_filter_include' in config_obj \ + and 'tag_filter_exclude' in config_obj: + raise ValueError('Harvest configuration cannot contain both ' + 'tag_filter_include and tag_filter_exclude') + for key in ['tag_filter_include', 'tag_filter_exclude']: + if key in config_obj: + tags_list = config_obj[key] + if not isinstance(tags_list, list): + raise ValueError(f"{key} must be a list of tags") + if not all(isinstance(item, str) for item in tags_list): + raise ValueError(f"{key} must be a list of strings") @staticmethod def modify_package_dict(package_dict, config, dcat_dict): @@ -483,7 +533,9 @@ class ResourceFormatOrder(BaseConfigProcessor): def check_config(config_obj): if 'resource_format_order' in config_obj: if not isinstance(config_obj['resource_format_order'], list): - raise ValueError('Resource format order should be provided as a list of strings') + raise ValueError('resource_format_order must be a list of strings') + if not all(isinstance(item, str) for item in config_obj['resource_format_order']): + raise ValueError('resource_format_order must be a list of strings') @staticmethod def modify_package_dict(package_dict, config_obj, dcat_dict): diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index 60cc990b..01151dd1 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -40,10 +40,6 @@ def _get_guids_and_datasets(self, content): # Raise custom exception which adds context raise JSONDecodeErrorContext(e.msg, e.doc, e.pos) from e - # Filter in/out datasets from particular organizations -
org_filter_include = self.config.get('organizations_filter_include', []) - org_filter_exclude = self.config.get('organizations_filter_exclude', []) - if isinstance(doc, list): # Assume a list of datasets datasets = doc @@ -52,6 +48,18 @@ def _get_guids_and_datasets(self, content): else: raise ValueError('Wrong JSON object') + # Filter in/out datasets from particular organizations + org_filter_include = self.config.get('organizations_filter_include', []) + org_filter_exclude = self.config.get('organizations_filter_exclude', []) + + # Filter in/out datasets with particular formats + format_filter_include = self.config.get('format_filter_include', []) + format_filter_exclude = self.config.get('format_filter_exclude', []) + + # Filter in/out datasets with particular tags + tag_filter_include = self.config.get('tag_filter_include', []) + tag_filter_exclude = self.config.get('tag_filter_exclude', []) + for dataset in datasets: # Get the organization name for the dataset dcat_publisher = dataset.get('publisher') @@ -72,19 +80,40 @@ def _get_guids_and_datasets(self, content): if dcat_publisher_name in org_filter_exclude: continue + # Include/exclude dataset based on particular formats + if format_filter_include or format_filter_exclude: + resource_formats = [ + dist.get('format', '').lower() + for dist in dataset.get('distribution', []) + if dist.get('format') + ] + if format_filter_include: + if not any(fmt in resource_formats for fmt in format_filter_include): + continue + elif format_filter_exclude: + if any(fmt in resource_formats for fmt in format_filter_exclude): + continue + + # Include/exclude dataset based on particular tags + if tag_filter_include: + if not any(tag in dataset.get('keyword', []) for tag in tag_filter_include): + continue + elif tag_filter_exclude: + if any(tag in dataset.get('keyword', []) for tag in tag_filter_exclude): + continue + as_string = json.dumps(dataset) # Get identifier guid = dataset.get('identifier') + if not guid: + # This is bad,
any ideas welcomed + guid = sha1(as_string.encode('utf-8')).hexdigest() if self.config.get('parse_id_if_url'): # Get id from identifier if it is a url guid = utils.parse_identifier(dataset.get('identifier')) - if not guid: - # This is bad, any ideas welcomed - guid = sha1(as_string.encode('utf-8')).hexdigest() - yield guid, as_string def _get_package_dict(self, harvest_object): diff --git a/ckanext/dcat/harvesters/base.py b/ckanext/dcat/harvesters/base.py index 3e316407..6dd8b8da 100644 --- a/ckanext/dcat/harvesters/base.py +++ b/ckanext/dcat/harvesters/base.py @@ -23,6 +23,8 @@ ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -52,6 +54,8 @@ class DCATHarvester(HarvesterBase): ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore diff --git a/ckanext/dcat/tests/test_configuration_processors.py b/ckanext/dcat/tests/test_configuration_processors.py index 5adfddf3..1c751621 100644 --- a/ckanext/dcat/tests/test_configuration_processors.py +++ b/ckanext/dcat/tests/test_configuration_processors.py @@ -8,6 +8,8 @@ Publisher, ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -879,6 +881,149 @@ def test_modify_package_remote_groups(self): assert group_names == ["climate", "science"] +class TestOrganizationFilter: + + processor = OrganizationFilter + + def test_validation_correct_format(self): + config = { + "organizations_filter_include": [ + "California Department of Technology", + "California Health and Human Services Agency", + "California Natural Resources Agency" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + config = { + "organizations_filter_exclude": [ + "OEHHA ArcGIS Online", + "DTSC_Admin" + ] + } + try: + self.processor.check_config(config) + except
ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "organizations_filter_include": "CDT, CalHHS, CNRA" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + config = { + "organizations_filter_exclude": "OEHHA ArcGIS Online, DTSC_Admin" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + +class TestFormatFilter: + + processor = FormatFilter + + def test_validation_correct_format(self): + config = { + "format_filter_include": [ + "CSV", + "GeoJSON" + ] + } + try: + self.processor.check_config(config) + assert config["format_filter_include"] == ["csv", "geojson"] + except ValueError: + assert False + + config = { + "format_filter_exclude": [ + "PDF" + ] + } + try: + self.processor.check_config(config) + assert config["format_filter_exclude"] == ["pdf"] + except ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "format_filter_include": "CSV, GeoJSON" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + config = { + "format_filter_exclude": "PDF" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + +class TestTagFilter: + + processor = TagFilter + + def test_validation_correct_format(self): + config = { + "tag_filter_include": [ + "Climate", + "Water" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + config = { + "tag_filter_exclude": [ + "Application", + "Software" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "tag_filter_include": "Climate, Water" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + config = { + "tag_filter_exclude": "Application, Software" + } + try: + self.processor.check_config(config) +
assert False + except ValueError: + assert True + + class TestResourceFormatOrder: processor = ResourceFormatOrder