Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 55 additions & 3 deletions ckanext/dcat/configuration_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ def check_config(config_obj):
if 'default_groups' in config_obj:
if not isinstance(config_obj['default_groups'], list):
raise ValueError('default_groups must be a *list* of group names/ids')
if config_obj['default_groups'] and not isinstance(config_obj['default_groups'][0], str):
raise ValueError('default_groups must be a list of group names/ids (i.e. strings)')
if not all(isinstance(item, str) for item in config_obj['default_groups']):
raise ValueError('default_groups must be a *list* of group names/ids (i.e. strings)')

# Check if default groups exist
context = {'model': model, 'user': p.toolkit.c.user}
Expand Down Expand Up @@ -471,6 +471,56 @@ def check_config(config_obj):
and 'organizations_filter_exclude' in config_obj:
raise ValueError('Harvest configuration cannot contain both '
'organizations_filter_include and organizations_filter_exclude')
for key in ['organizations_filter_include', 'organizations_filter_exclude']:
if key in config_obj:
orgs_list = config_obj[key]
if not isinstance(orgs_list, list):
raise ValueError(f"{key} must be a list of organizations")
if not all(isinstance(item, str) for item in orgs_list):
raise ValueError(f"{key} must be a list of strings")

@staticmethod
def modify_package_dict(package_dict, config, dcat_dict):
pass


class FormatFilter(BaseConfigProcessor):
    """Config processor for the ``format_filter_include`` and
    ``format_filter_exclude`` harvest options."""

    @staticmethod
    def check_config(config_obj):
        """Validate the format filter options and lower-case them in place.

        At most one of ``format_filter_include`` and
        ``format_filter_exclude`` may be present. The one that is present
        must be a list of strings; it is replaced in ``config_obj`` by a
        new list holding the lower-cased values.

        Raises:
            ValueError: if both options are present, if the value is not a
                list, or if any item of the list is not a string.
        """
        include_key = 'format_filter_include'
        exclude_key = 'format_filter_exclude'

        if include_key in config_obj and exclude_key in config_obj:
            raise ValueError('Harvest configuration cannot contain both '
                             'format_filter_include and format_filter_exclude')

        for option in (include_key, exclude_key):
            if option not in config_obj:
                continue
            values = config_obj[option]
            if not isinstance(values, list):
                raise ValueError(f"{option} must be a list of formats")
            for value in values:
                if not isinstance(value, str):
                    raise ValueError(f"{option} must be a list of strings")
            # Store the normalised list back on the config object.
            config_obj[option] = [value.lower() for value in values]

    @staticmethod
    def modify_package_dict(package_dict, config, dcat_dict):
        """Deliberate no-op: none of the arguments is read or modified."""
        return None


class TagFilter(BaseConfigProcessor):

@staticmethod
def check_config(config_obj):
if 'tag_filter_include' in config_obj \
and 'tag_filter_exclude' in config_obj:
raise ValueError('Harvest configuration cannot contain both '
'tag_filter_include and tag_filter_exclude')
for key in ['tag_filter_include', 'tag_filter_exclude']:
if key in config_obj:
tags_list = config_obj[key]
if not isinstance(tags_list, list):
raise ValueError(f"{key} must be a list of tags")
if not all(isinstance(item, str) for item in tags_list):
raise ValueError(f"{key} must be a list of strings")

@staticmethod
def modify_package_dict(package_dict, config, dcat_dict):
Expand All @@ -483,7 +533,9 @@ class ResourceFormatOrder(BaseConfigProcessor):
def check_config(config_obj):
    """Validate the optional ``resource_format_order`` harvest option.

    A configuration without the key is accepted. When the key is present
    its value must be a list in which every item is a string.

    Raises:
        ValueError: if the value is not a list, or if any item of the list
            is not a string.
    """
    if 'resource_format_order' not in config_obj:
        return

    format_order = config_obj['resource_format_order']
    # Both failure modes report the same message, so they share one raise;
    # the earlier duplicate raise with a differently worded message left
    # the second statement unreachable.
    if not isinstance(format_order, list) \
            or not all(isinstance(item, str) for item in format_order):
        raise ValueError('resource_format_order must be a list of strings')

@staticmethod
def modify_package_dict(package_dict, config_obj, dcat_dict):
Expand Down
45 changes: 37 additions & 8 deletions ckanext/dcat/harvesters/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@ def _get_guids_and_datasets(self, content):
# Raise custom exception which adds context
raise JSONDecodeErrorContext(e.msg, e.doc, e.pos) from e

# Filter in/out datasets from particular organizations
org_filter_include = self.config.get('organizations_filter_include', [])
org_filter_exclude = self.config.get('organizations_filter_exclude', [])

if isinstance(doc, list):
# Assume a list of datasets
datasets = doc
Expand All @@ -52,6 +48,18 @@ def _get_guids_and_datasets(self, content):
else:
raise ValueError('Wrong JSON object')

# Filter in/out datasets from particular organizations
org_filter_include = self.config.get('organizations_filter_include', [])
org_filter_exclude = self.config.get('organizations_filter_exclude', [])

# Filter in/out datasets with particular formats
format_filter_include = self.config.get('format_filter_include', [])
format_filter_exclude = self.config.get('format_filter_exclude', [])

# Filter in/out datasets with particular tags
tag_filter_include = self.config.get('tag_filter_include', [])
tag_filter_exclude = self.config.get('tag_filter_exclude', [])

for dataset in datasets:
# Get the organization name for the dataset
dcat_publisher = dataset.get('publisher')
Expand All @@ -72,19 +80,40 @@ def _get_guids_and_datasets(self, content):
if dcat_publisher_name in org_filter_exclude:
continue

# Include/exclude dataset based on particular formats
if format_filter_include or format_filter_exclude:
resource_formats = [
dist.get('format', '').lower()
for dist in dataset.get('distribution', [])
if dist.get('format')
]
if format_filter_include:
if not any(fmt in resource_formats for fmt in format_filter_include):
continue
elif format_filter_exclude:
if any(fmt in resource_formats for fmt in format_filter_exclude):
continue

# Include/exclude dataset based on particular tags
if tag_filter_include:
if not any(tag in dataset.get('keyword', []) for tag in tag_filter_include):
continue
elif tag_filter_exclude:
if any(tag in dataset.get('keyword', []) for tag in tag_filter_exclude):
continue

as_string = json.dumps(dataset)

# Get identifier
guid = dataset.get('identifier')
if not guid:
# This is bad, any ideas welcomed
guid = sha1(as_string.encode('utf-8')).hexdigest()

if self.config.get('parse_id_if_url'):
# Get id from identifier if it is a url
guid = utils.parse_identifier(dataset.get('identifier'))

if not guid:
# This is bad, any ideas welcomed
guid = sha1(as_string.encode('utf-8')).hexdigest()

yield guid, as_string

def _get_package_dict(self, harvest_object):
Expand Down
4 changes: 4 additions & 0 deletions ckanext/dcat/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
ContactPoint,
RemoteGroups,
OrganizationFilter,
FormatFilter,
TagFilter,
ResourceFormatOrder,
KeepExistingResources,
UploadToDatastore
Expand Down Expand Up @@ -52,6 +54,8 @@ class DCATHarvester(HarvesterBase):
ContactPoint,
RemoteGroups,
OrganizationFilter,
FormatFilter,
TagFilter,
ResourceFormatOrder,
KeepExistingResources,
UploadToDatastore
Expand Down
145 changes: 145 additions & 0 deletions ckanext/dcat/tests/test_configuration_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
Publisher, ContactPoint,
RemoteGroups,
OrganizationFilter,
FormatFilter,
TagFilter,
ResourceFormatOrder,
KeepExistingResources,
UploadToDatastore
Expand Down Expand Up @@ -879,6 +881,149 @@ def test_modify_package_remote_groups(self):
assert group_names == ["climate", "science"]


class TestOrganizationFilter:
    """Config validation checks for ``OrganizationFilter``."""

    processor = OrganizationFilter

    def test_validation_correct_format(self):
        """A list of organization names is accepted for either option."""
        include_config = {
            "organizations_filter_include": [
                "California Department of Technology",
                "California Health and Human Services Agency",
                "California Natural Resources Agency"
            ]
        }
        exclude_config = {
            "organizations_filter_exclude": [
                "OEHHA ArcGIS Online",
                "DTSC_Admin"
            ]
        }
        for valid_config in (include_config, exclude_config):
            try:
                self.processor.check_config(valid_config)
            except ValueError:
                assert False

    def test_validation_wrong_format(self):
        """A comma-separated string in place of a list is rejected."""
        invalid_configs = (
            {"organizations_filter_include": "CDT, CalHHS, CNRA"},
            {"organizations_filter_exclude": "OEHHA ArcGIS Online, DTSC_Admin"},
        )
        for invalid_config in invalid_configs:
            try:
                self.processor.check_config(invalid_config)
            except ValueError:
                assert True
            else:
                # No ValueError means the bad value slipped through.
                assert False


class TestFormatFilter:
    """Config validation checks for ``FormatFilter``."""

    processor = FormatFilter

    def test_validation_correct_format(self):
        """Lists of formats are accepted and come back lower-cased."""
        cases = (
            ("format_filter_include", ["CSV", "GeoJSON"], ["csv", "geojson"]),
            ("format_filter_exclude", ["PDF"], ["pdf"]),
        )
        for option, supplied, expected in cases:
            config = {option: supplied}
            try:
                self.processor.check_config(config)
                assert config[option] == expected
            except ValueError:
                assert False

    def test_validation_wrong_format(self):
        """A plain string in place of a list is rejected."""
        invalid_configs = (
            {"format_filter_include": "CSV, GeoJSON"},
            {"format_filter_exclude": "PDF"},
        )
        for invalid_config in invalid_configs:
            try:
                self.processor.check_config(invalid_config)
            except ValueError:
                assert True
            else:
                # No ValueError means the bad value slipped through.
                assert False


class TestTagFilter:
    """Config validation checks for ``TagFilter``."""

    processor = TagFilter

    def test_validation_correct_format(self):
        """A list of tag names is accepted for either option."""
        valid_configs = (
            {"tag_filter_include": ["Climate", "Water"]},
            {"tag_filter_exclude": ["Application", "Software"]},
        )
        for valid_config in valid_configs:
            try:
                self.processor.check_config(valid_config)
            except ValueError:
                assert False

    def test_validation_wrong_format(self):
        """A comma-separated string in place of a list is rejected."""
        invalid_configs = (
            {"tag_filter_include": "Climate, Water"},
            {"tag_filter_exclude": "Application, Software"},
        )
        for invalid_config in invalid_configs:
            try:
                self.processor.check_config(invalid_config)
            except ValueError:
                assert True
            else:
                # No ValueError means the bad value slipped through.
                assert False


class TestResourceFormatOrder:

processor = ResourceFormatOrder
Expand Down