Commit c4b981ab authored by Vilem Ded's avatar Vilem Ded
Browse files

Merge branch 'add-support-dish-v10' into 'master'

DISH v.10

See merge request elixir/metadata-tools!20
parents e17b95d1 298cd8f7
......@@ -9,6 +9,15 @@ git clone <repository-url>
git submodule update --init
```
## Versions of DISH
Tags are used for keeping track of which code version supported which DISH version.
An old DISH exporter can produce JSON that is not compatible with the newest JSON schemas or the Daisy importer. In this case, migrating the DISH to the current version is the safest procedure.
Versions:
- v.8 -> v.9 - no schema-breaking change, update of one text field
- v.9 -> v.10 - no schema-breaking change, only additional data use restrictions are collected
## Development environment setup
Create virtual environment:
......
......@@ -29,6 +29,14 @@ parser.add_argument(
dest="output_dir"
)
parser.add_argument(
'--skip-version-validation',
help='',
dest="skip_version_validation",
action='store_true'
)
args = parser.parse_args()
if not (args.dirname or args.file):
parser.error('No arguments provided. Either -d or -f must be specified')
......@@ -54,7 +62,7 @@ exporter = DishXlsExporter()
counter = 0
for fname in fileList:
try:
dataset_dict = exporter.export_submission(fname)
dataset_dict = exporter.export_submission(fname, args.skip_version_validation)
except Exception as e:
raise Exception(f'Error occured in file {fname}') from e
......
......@@ -2,7 +2,7 @@ import logging
import pyexcel
import hashlib
from os import fsencode
from .export_utils import get_partners_from_daisy, get_email_from_string, process_possible_date, process_yes_no_dontknow_answer, get_value_list_from_row, is_data, is_study, is_submission, process_yes_no_answer, get_names_from_string
from .export_utils import validate_dataset_dict, get_partners_from_daisy, get_email_from_string, process_possible_date, process_yes_no_dontknow_answer, get_value_list_from_row, is_data, is_study, is_submission, process_yes_no_answer, get_names_from_string
from .validators import DISHVersionValidator
class DishXlsExporter:
......@@ -48,14 +48,15 @@ class DishXlsExporter:
"Other"
])
def export_submission(self, full_file_path):
def export_submission(self, full_file_path, skip_version_validation=False):
idx = 1
logging.info('Processing start for ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path, skip_hidden_sheets = False)
is_dish = any("_Help" in elem for elem in book.sheet_names())
version_validator = DISHVersionValidator()
version_validator.validate_book(book)
if not skip_version_validation:
version_validator = DISHVersionValidator()
version_validator.validate_book(book)
if is_dish:
dataset_dict = {
......@@ -85,7 +86,7 @@ class DishXlsExporter:
pass
idx += 1
validate_dataset_dict(dataset_dict)
logging.info('Processing end for ----> {}'.format(full_file_path))
return dataset_dict
else:
......@@ -207,63 +208,97 @@ class DishXlsExporter:
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[35, 0]})
# Is the data being sent to ELIXIR-LU/LCSB for a limited duration?
# Does the limitation to the RESEARCH PROJECT include the Research Use (as defined in the Consortium Agreement)?
if process_yes_no_answer(sheet[36, 1]):
datadec_dict["storage_end_date"] = process_possible_date(sheet[37, 1])
use_restrictions.append({'use_class': 'PS',
'use_restriction_rule': "CONSTRAINED_PERMISSION",
'use_class_note': sheet[36, 0],
'use_restriction_note': "Limitation to the reseearch project includes the Reserach Use."})
else:
use_restrictions.append({'use_class': 'PS',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[36, 0]})
# Is the data being sent to ELIXIR-LU/LCSB for a limited duration?
if process_yes_no_answer(sheet[37, 1]):
datadec_dict["storage_end_date"] = process_possible_date(sheet[38, 1])
use_restrictions.append({'use_class': 'TS-[XX]',
'use_restriction_rule': "OBLIGATION",
'use_class_note': sheet[36, 0],
'use_restriction_note': process_possible_date(sheet[37, 1])})
'use_class_note': sheet[37, 0],
'use_restriction_note': process_possible_date(sheet[38, 1])})
else:
use_restrictions.append({'use_class': 'TS-[XX]',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[36, 0]})
'use_class_note': sheet[37, 0]})
# Are there any requirements in case of publications based on the DATA?
if process_yes_no_answer(sheet[38, 1]):
if process_yes_no_answer(sheet[39, 1]):
use_restrictions.append({'use_class': 'PUB',
'use_restriction_rule': "OBLIGATION",
'use_class_note': sheet[38, 0],
'use_restriction_note': sheet[39, 1]})
'use_class_note': sheet[39, 0],
'use_restriction_note': sheet[40, 1]})
else:
use_restrictions.append({'use_class': 'PUB',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[38, 0]})
'use_class_note': sheet[39, 0]})
# Is there a requirement to return data or documents to the database/resource?
if process_yes_no_answer(sheet[40, 1]):
if process_yes_no_answer(sheet[41, 1]):
use_restrictions.append({'use_class': 'RTN',
'use_restriction_rule': "OBLIGATION",
'use_class_note': sheet[40, 0],
'use_restriction_note': sheet[41, 1]})
'use_class_note': sheet[41, 0],
'use_restriction_note': sheet[42, 1]})
else:
use_restrictions.append({'use_class': 'RTN',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[40, 0]})
'use_class_note': sheet[41, 0]})
# Is the use limited to approved users/groups/institutions?
if process_yes_no_answer(sheet[43, 1]):
use_restrictions.append({'use_class': 'US',
'use_restriction_rule': "CONSTRAINED_PERMISSION",
'use_class_note': sheet[43, 0],
'use_restriction_note': sheet[44, 1]})
else:
use_restrictions.append({'use_class': 'US',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[43, 1]})
if sheet[42, 1]:
# If there are any other restrictions on DATA, please describe them here. If applicable, in your description you may refer to GA4GH Data Use Category Codes, found at the below link.
if sheet[45, 1]:
use_restrictions.append({'use_class': 'Other',
'use_restriction_rule': "CONSTRAINED_PERMISSION",
'use_class_note': sheet[42, 0],
'use_restriction_note': sheet[42, 1]})
'use_class_note': sheet[45, 0],
'use_restriction_note': sheet[45, 1]})
# Are there any IP restrictions/requirements when using the DATA?
# Is the use limited to approved users/groups/institutions?
if process_yes_no_answer(sheet[47, 1]):
use_restrictions.append({'use_class': 'IP',
use_restrictions.append({'use_class': 'US',
'use_restriction_rule': "CONSTRAINED_PERMISSION",
'use_class_note': sheet[47, 0],
'use_restriction_note': sheet[48, 1]})
else:
use_restrictions.append({'use_class': 'IP',
use_restrictions.append({'use_class': 'US',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[47, 1]})
# Are there any IP restrictions/requirements when using the DATA?
if process_yes_no_answer(sheet[50, 1]):
use_restrictions.append({'use_class': 'IP',
'use_restriction_rule': "CONSTRAINED_PERMISSION",
'use_class_note': sheet[50, 0],
'use_restriction_note': sheet[51, 1]})
else:
use_restrictions.append({'use_class': 'IP',
'use_restriction_rule': "PERMISSION",
'use_class_note': sheet[50, 1]})
datadec_dict['use_restrictions'] = use_restrictions
datadec_dict["access_procedure"] = ""
if sheet[45, 1] and ('not' in sheet[45, 1]):
if sheet[44, 1] and ('no' in sheet[44, 1]):
if sheet[47, 1] and ('not' in sheet[47, 1]):
if sheet[49, 1] and ('no' in sheet[49, 1]):
datadec_dict["access_category"] = "open_access" #this is just an initial interpretation and should be further curated in catalog
datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "No additional form is needed to request access."
else:
......@@ -271,7 +306,7 @@ class DishXlsExporter:
datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "Additional form is needed to request access."
else:
datadec_dict["access_category"] = "controlled_access"
datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + sheet[46, 1]
datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + sheet[49, 1]
dataset_dict["data_declarations"].append(datadec_dict)
......
......@@ -161,6 +161,15 @@ def get_partners_from_daisy():
return json.loads(entities_json_str)
def validate_dataset_dict(dataset_dict) -> bool:
    """Check that every data declaration refers to a study listed in the dataset.

    A warning is logged for each data declaration whose ``source_study`` does
    not match the ``name`` of any entry in ``dataset_dict['studies']``.

    Args:
        dataset_dict: Mapping with a ``'data_declarations'`` list (each item
            has a ``'source_study'`` key and optionally a ``'title'``) and a
            ``'studies'`` list (each item has a ``'name'`` key).

    Returns:
        True when all data declarations have a matching study (or there are
        no data declarations at all), False otherwise.

    Raises:
        KeyError: If a required key is missing from the dataset or from one
            of its data declarations / studies.
    """
    declarations = dataset_dict['data_declarations']
    if not declarations:
        # Nothing to check; 'studies' is intentionally not touched here.
        return True

    # Collect the study names once instead of rebuilding them per declaration.
    study_names = [study['name'] for study in dataset_dict['studies']]

    is_valid = True
    for dd in declarations:
        study = dd['source_study']
        if study not in study_names:
            dd_name = dd.get('title')
            logging.warning(f'Data declaration \'{dd_name}\' has no matching study \'{study}\'')
            # Keep going so that every mismatch is reported, not just the first.
            is_valid = False
    return is_valid
def save_exported_datasets_to_file(exported_dataset, output_file):
if isinstance(exported_dataset, list):
......
......@@ -4,7 +4,7 @@ from typing import Text
class DISHVersionValidator():
_supported_version = 'v.8'
_supported_version = 'v.10'
def get_supported_version(self):
return self._supported_version
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment