From 92bf5ff09e75a9847a52c273eab1a5c2cbe16483 Mon Sep 17 00:00:00 2001 From: Jay Guo Date: Tue, 6 May 2025 18:53:04 -0400 Subject: [PATCH 1/2] Filter on formats --- ckanext/dcat/configuration_processors.py | 33 ++++++++ ckanext/dcat/harvesters/_json.py | 18 +++++ ckanext/dcat/harvesters/base.py | 2 + .../tests/test_configuration_processors.py | 77 +++++++++++++++++++ 4 files changed, 130 insertions(+) diff --git a/ckanext/dcat/configuration_processors.py b/ckanext/dcat/configuration_processors.py index 24818357..efe10217 100644 --- a/ckanext/dcat/configuration_processors.py +++ b/ckanext/dcat/configuration_processors.py @@ -471,6 +471,39 @@ def check_config(config_obj): and 'organizations_filter_exclude' in config_obj: raise ValueError('Harvest configuration cannot contain both ' 'organizations_filter_include and organizations_filter_exclude') + for key in ['organizations_filter_include', 'organizations_filter_exclude']: + if key in config_obj: + orgs_list = config_obj[key] + if not isinstance(orgs_list, list): + raise ValueError(f"{key} must be a list of organizations") + if not orgs_list: + raise ValueError(f"{key} cannot be empty") + if not all(isinstance(item, str) for item in orgs_list): + raise ValueError(f"{key} must be a list of strings") + + @staticmethod + def modify_package_dict(package_dict, config, dcat_dict): + pass + + +class FormatFilter(BaseConfigProcessor): + + @staticmethod + def check_config(config_obj): + if 'format_filter_include' in config_obj \ + and 'format_filter_exclude' in config_obj: + raise ValueError('Harvest configuration cannot contain both ' + 'format_filter_include and format_filter_exclude') + for key in ['format_filter_include', 'format_filter_exclude']: + if key in config_obj: + formats_list = config_obj[key] + if not isinstance(formats_list, list): + raise ValueError(f"{key} must be a list of formats") + if not formats_list: + raise ValueError(f"{key} cannot be empty") + if not all(isinstance(item, str) for item in formats_list): + raise ValueError(f"{key} must be a list of strings") + config_obj[key] = [fmt.lower() for fmt in formats_list] @staticmethod def modify_package_dict(package_dict, config, dcat_dict): diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index 60cc990b..45297d5d 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -44,6 +44,10 @@ def _get_guids_and_datasets(self, content): org_filter_include = self.config.get('organizations_filter_include', []) org_filter_exclude = self.config.get('organizations_filter_exclude', []) + # Filter in/out datasets from particular organizations + format_filter_include = self.config.get('format_filter_include', []) + format_filter_exclude = self.config.get('format_filter_exclude', []) + if isinstance(doc, list): # Assume a list of datasets datasets = doc @@ -72,6 +76,20 @@ def _get_guids_and_datasets(self, content): if dcat_publisher_name in org_filter_exclude: continue + # Include/exclude dataset based on particular formats + if format_filter_include or format_filter_exclude: + resource_formats = [ + dist.get('format', '').lower() + for dist in dataset.get('distribution', []) + if dist.get('format') + ] + if format_filter_include: + if not any(fmt in resource_formats for fmt in format_filter_include): + continue + elif format_filter_exclude: + if any(fmt in resource_formats for fmt in format_filter_exclude): + continue + as_string = json.dumps(dataset) # Get identifier diff --git a/ckanext/dcat/harvesters/base.py b/ckanext/dcat/harvesters/base.py index 3e316407..66670d56 100644 --- a/ckanext/dcat/harvesters/base.py +++ b/ckanext/dcat/harvesters/base.py @@ -23,6 +23,7 @@ ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -52,6 +53,7 @@ class DCATHarvester(HarvesterBase): ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore diff --git a/ckanext/dcat/tests/test_configuration_processors.py b/ckanext/dcat/tests/test_configuration_processors.py index 5adfddf3..23cd2c1a 100644 --- a/ckanext/dcat/tests/test_configuration_processors.py +++ b/ckanext/dcat/tests/test_configuration_processors.py @@ -8,6 +8,7 @@ Publisher, ContactPoint, RemoteGroups, OrganizationFilter, + FormatFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -879,6 +880,82 @@ def test_modify_package_remote_groups(self): assert group_names == ["climate", "science"] +class TestOrganizationFilter: + + processor = OrganizationFilter + + def test_validation_correct_format(self): + config = { + "organizations_filter_include": [ + "California Department of Technology", + "California Health and Human Services Agency", + "California Natural Resources Agency" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "organizations_filter_include": "CDT, CalHHS, CNRA" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + +class TestFormatFilter: + + processor = FormatFilter + + def test_validation_correct_format(self): + config = { + "format_filter_include": [ + "CSV", + "GeoJSON" + ] + } + try: + self.processor.check_config(config) + assert config["format_filter_include"] == ["csv", "geojson"] + except ValueError: + assert False + + config = { + "format_filter_exclude": [ + "PDF" + ] + } + try: + self.processor.check_config(config) + assert config["format_filter_exclude"] == ["pdf"] + except ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "format_filter_include": "CSV, GeoJSON" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + config = { + "format_filter_exclude": "PDF" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + class TestResourceFormatOrder: processor = ResourceFormatOrder From 80c88336d5aaed54c1f0fecc77b19a9c7e0863ce Mon Sep 17 00:00:00 2001 From: Jay Guo Date: Wed, 7 May 2025 16:13:25 -0400 Subject: [PATCH 2/2] Allow filter on tags --- ckanext/dcat/configuration_processors.py | 33 +++++++-- ckanext/dcat/harvesters/_json.py | 35 ++++++---- ckanext/dcat/harvesters/base.py | 2 + .../tests/test_configuration_processors.py | 68 +++++++++++++++++++ 4 files changed, 119 insertions(+), 19 deletions(-) diff --git a/ckanext/dcat/configuration_processors.py b/ckanext/dcat/configuration_processors.py index efe10217..10597031 100644 --- a/ckanext/dcat/configuration_processors.py +++ b/ckanext/dcat/configuration_processors.py @@ -123,8 +123,8 @@ def check_config(config_obj): if 'default_groups' in config_obj: if not isinstance(config_obj['default_groups'], list): raise ValueError('default_groups must be a *list* of group names/ids') - if config_obj['default_groups'] and not isinstance(config_obj['default_groups'][0], str): - raise ValueError('default_groups must be a list of group names/ids (i.e. strings)') + if not all(isinstance(item, str) for item in config_obj['default_groups']): + raise ValueError('default_groups must be a *list* of group names/ids (i.e. strings)') # Check if default groups exist context = {'model': model, 'user': p.toolkit.c.user} @@ -476,8 +476,6 @@ def check_config(config_obj): orgs_list = config_obj[key] if not isinstance(orgs_list, list): raise ValueError(f"{key} must be a list of organizations") - if not orgs_list: - raise ValueError(f"{key} cannot be empty") if not all(isinstance(item, str) for item in orgs_list): raise ValueError(f"{key} must be a list of strings") @@ -499,8 +497,6 @@ def check_config(config_obj): formats_list = config_obj[key] if not isinstance(formats_list, list): raise ValueError(f"{key} must be a list of formats") - if not formats_list: - raise ValueError(f"{key} cannot be empty") if not all(isinstance(item, str) for item in formats_list): raise ValueError(f"{key} must be a list of strings") config_obj[key] = [fmt.lower() for fmt in formats_list] @@ -510,13 +506,36 @@ def modify_package_dict(package_dict, config, dcat_dict): pass +class TagFilter(BaseConfigProcessor): + + @staticmethod + def check_config(config_obj): + if 'tag_filter_include' in config_obj \ + and 'tag_filter_exclude' in config_obj: + raise ValueError('Harvest configuration cannot contain both ' + 'tag_filter_include and tag_filter_exclude') + for key in ['tag_filter_include', 'tag_filter_exclude']: + if key in config_obj: + tags_list = config_obj[key] + if not isinstance(tags_list, list): + raise ValueError(f"{key} must be a list of tags") + if not all(isinstance(item, str) for item in tags_list): + raise ValueError(f"{key} must be a list of strings") + + @staticmethod + def modify_package_dict(package_dict, config, dcat_dict): + pass + + class ResourceFormatOrder(BaseConfigProcessor): @staticmethod def check_config(config_obj): if 'resource_format_order' in config_obj: if not isinstance(config_obj['resource_format_order'], list): - raise ValueError('Resource format order should be provided as a list of strings') + raise ValueError('resource_format_order must be a list of strings') + if not all(isinstance(item, str) for item in config_obj['resource_format_order']): + raise ValueError('resource_format_order must be a list of strings') @staticmethod def modify_package_dict(package_dict, config_obj, dcat_dict): diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index 45297d5d..01151dd1 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -40,14 +40,6 @@ def _get_guids_and_datasets(self, content): # Raise custom exception which adds context raise JSONDecodeErrorContext(e.msg, e.doc, e.pos) from e - # Filter in/out datasets from particular organizations - org_filter_include = self.config.get('organizations_filter_include', []) - org_filter_exclude = self.config.get('organizations_filter_exclude', []) - - # Filter in/out datasets from particular organizations - format_filter_include = self.config.get('format_filter_include', []) - format_filter_exclude = self.config.get('format_filter_exclude', []) - if isinstance(doc, list): # Assume a list of datasets datasets = doc @@ -56,6 +48,18 @@ def _get_guids_and_datasets(self, content): else: raise ValueError('Wrong JSON object') + # Filter in/out datasets from particular organizations + org_filter_include = self.config.get('organizations_filter_include', []) + org_filter_exclude = self.config.get('organizations_filter_exclude', []) + + # Filter in/out datasets with particular formats + format_filter_include = self.config.get('format_filter_include', []) + format_filter_exclude = self.config.get('format_filter_exclude', []) + + # Filter in/out datasets with particular tags + tag_filter_include = self.config.get('tag_filter_include', []) + tag_filter_exclude = self.config.get('tag_filter_exclude', []) + for dataset in datasets: # Get the organization name for the dataset dcat_publisher = dataset.get('publisher') @@ -90,19 +94,26 @@ def _get_guids_and_datasets(self, content): if any(fmt in resource_formats for fmt in format_filter_exclude): continue + # Include/exclude dataset based on particular tags + if tag_filter_include: + if not any(tag in dataset.get('keyword', []) for tag in tag_filter_include): + continue + elif tag_filter_exclude: + if any(tag in dataset.get('keyword', []) for tag in tag_filter_exclude): + continue + as_string = json.dumps(dataset) # Get identifier guid = dataset.get('identifier') + if not guid: + # This is bad, any ideas welcomed + guid = sha1(as_string.encode('utf-8')).hexdigest() if self.config.get('parse_id_if_url'): # Get id from identifier if it is a url guid = utils.parse_identifier(dataset.get('identifier')) - if not guid: - # This is bad, any ideas welcomed - guid = sha1(as_string.encode('utf-8')).hexdigest() - yield guid, as_string def _get_package_dict(self, harvest_object): diff --git a/ckanext/dcat/harvesters/base.py b/ckanext/dcat/harvesters/base.py index 66670d56..6dd8b8da 100644 --- a/ckanext/dcat/harvesters/base.py +++ b/ckanext/dcat/harvesters/base.py @@ -24,6 +24,7 @@ RemoteGroups, OrganizationFilter, FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -54,6 +55,7 @@ class DCATHarvester(HarvesterBase): RemoteGroups, OrganizationFilter, FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore diff --git a/ckanext/dcat/tests/test_configuration_processors.py b/ckanext/dcat/tests/test_configuration_processors.py index 23cd2c1a..1c751621 100644 --- a/ckanext/dcat/tests/test_configuration_processors.py +++ b/ckanext/dcat/tests/test_configuration_processors.py @@ -9,6 +9,7 @@ RemoteGroups, OrganizationFilter, FormatFilter, + TagFilter, ResourceFormatOrder, KeepExistingResources, UploadToDatastore @@ -897,6 +898,17 @@ def test_validation_correct_format(self): except ValueError: assert False + config = { + "organizations_filter_exclude": [ + "OEHHA ArcGIS Online", + "DTSC_Admin" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + def test_validation_wrong_format(self): config = { "organizations_filter_include": "CDT, CalHHS, CNRA" @@ -906,6 +918,15 @@ def test_validation_wrong_format(self): assert False except ValueError: assert True + + config = { + "organizations_filter_exclude": "OEHHA ArcGIS Online, DTSC_Admin" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True class TestFormatFilter: @@ -956,6 +977,53 @@ def test_validation_wrong_format(self): assert True +class TestTagFilter: + + processor = TagFilter + + def test_validation_correct_format(self): + config = { + "tag_filter_include": [ + "Climate", + "Water" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + config = { + "tag_filter_exclude": [ + "Application", + "Software" + ] + } + try: + self.processor.check_config(config) + except ValueError: + assert False + + def test_validation_wrong_format(self): + config = { + "tag_filter_include": "Climate, Water" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + config = { + "tag_filter_exclude": "Application, Software" + } + try: + self.processor.check_config(config) + assert False + except ValueError: + assert True + + class TestResourceFormatOrder: processor = ResourceFormatOrder