Commit 23e16ed0 authored by Pinar Alper
Browse files

Merge branch 'updates-to-exporter' into 'master'

Updates to exporter

See merge request pinar.alper/metadata-tools!10
parents 3ab1ee8b 00b7b6b8
...@@ -60,7 +60,7 @@ for fname in fileList: ...@@ -60,7 +60,7 @@ for fname in fileList:
fname_json = pathlib.Path(fname).stem + ".json" fname_json = pathlib.Path(fname).stem + ".json"
fname_out = os.path.join(args.output_dir, fname_json) fname_out = os.path.join(args.output_dir, fname_json)
with open(fname_out, 'w') as outfile: with open(fname_out, 'w', encoding='utf-8') as outfile:
save_exported_datasets_to_file(dataset_dict, outfile) save_exported_datasets_to_file(dataset_dict, outfile)
counter += 1 counter += 1
......
...@@ -67,8 +67,8 @@ class DishXlsExporter: ...@@ -67,8 +67,8 @@ class DishXlsExporter:
logging.info('Processing sheet ----> {}'.format(book.sheet_names()[idx])) logging.info('Processing sheet ----> {}'.format(book.sheet_names()[idx]))
if is_study(sheet): if is_study(sheet):
cohort_dict = {'name': sheet[1, 1], cohort_dict = {'name': sheet[1, 1].strip(),
'description' : sheet[2, 1] + ' ' + sheet[6, 1], 'description' : sheet[2, 1] + ' ' + sheet[6, 1] + ' ' + sheet[16,1],
'has_ethics_approval' : process_yes_no_answer(sheet[4, 1]), 'has_ethics_approval' : process_yes_no_answer(sheet[4, 1]),
"ethics_approval_notes": sheet[5, 1], "ethics_approval_notes": sheet[5, 1],
"url": sheet[3, 1], "url": sheet[3, 1],
...@@ -81,7 +81,7 @@ class DishXlsExporter: ...@@ -81,7 +81,7 @@ class DishXlsExporter:
} }
if sheet[12, 1] and sheet[15, 1]: if sheet[12, 1] and sheet[15, 1]:
cohort_dict["contacts"] = cohort_dict["contacts"].append({"first_name": get_names_from_string(sheet[12,1])[0], cohort_dict["contacts"].append({"first_name": get_names_from_string(sheet[12,1])[0],
"last_name": get_names_from_string(sheet[12,1])[1], "last_name": get_names_from_string(sheet[12,1])[1],
"role": sheet[15,1], "role": sheet[15,1],
"email":sheet[13,1], "email":sheet[13,1],
...@@ -90,14 +90,17 @@ class DishXlsExporter: ...@@ -90,14 +90,17 @@ class DishXlsExporter:
dataset_dict["studies"].append(cohort_dict) dataset_dict["studies"].append(cohort_dict)
elif is_data(sheet): elif is_data(sheet):
datadec_dict = {'title' : sheet[1, 1], datadec_dict = {'title' : sheet[1, 1].strip(),
'source_study' : sheet[2, 1], 'source_study' : sheet[2, 1].strip(),
"data_types":[]} "data_types":[]}
datadec_dict["data_type_notes"] = sheet[7, 1] datadec_dict["data_type_notes"] = sheet[7, 1]
data_type_info = self.process_data_types(get_value_list_from_row(sheet, 6)) data_type_info = self.process_data_types(get_value_list_from_row(sheet, 6))
datadec_dict["data_types"].extend(data_type_info[0]) datadec_dict["data_types"].extend(data_type_info[0])
datadec_dict["data_type_notes"] = datadec_dict["data_type_notes"] +" "+ data_type_info[1]+ " Notes on samples: " + sheet[10, 1] if data_type_info[1]:
datadec_dict["data_type_notes"] += " " + data_type_info[1]
if sheet[10, 1]:
datadec_dict["data_type_notes"] += " Notes on samples: " + sheet[10, 1]
#if it involves samples add this as a datatype #if it involves samples add this as a datatype
if process_yes_no_answer(sheet[9, 1]): if process_yes_no_answer(sheet[9, 1]):
...@@ -263,31 +266,31 @@ class DishXlsExporter: ...@@ -263,31 +266,31 @@ class DishXlsExporter:
dataset_dict["data_declarations"].append(datadec_dict) dataset_dict["data_declarations"].append(datadec_dict)
elif is_submission(sheet): elif is_submission(sheet):
dataset_dict["name"] = sheet[2, 1] dataset_dict["name"] = sheet[2, 1].strip()
dataset_dict["project"] = sheet[5, 1] dataset_dict["project"] = sheet[5, 1].strip()
dataset_dict["contacts"].extend([{"first_name": get_names_from_string(sheet[9, 1])[0], dataset_dict["contacts"].extend([{"first_name": get_names_from_string(sheet[9, 1])[0],
"last_name":get_names_from_string(sheet[9, 1])[1], "last_name":get_names_from_string(sheet[9, 1])[1],
"role": sheet[11,1], "role": sheet[11,1],
"email":sheet[10,1], "email":sheet[10,1].strip(),
"affiliations": [self.process_institution(sheet[7,1])] "affiliations": [self.process_institution(sheet[7,1])]
}, },
{"first_name": get_names_from_string(sheet[12, 1])[0], {"first_name": get_names_from_string(sheet[12, 1])[0],
"last_name":get_names_from_string(sheet[12, 1])[1], "last_name":get_names_from_string(sheet[12, 1])[1],
"role": "Legal_Representative", "role": "Legal_Representative",
"email":sheet[10,1], "email":sheet[13,1].strip(),
"affiliations": [self.process_institution(sheet[7,1])] "affiliations": [self.process_institution(sheet[7,1])]
}, },
{"first_name": get_names_from_string(sheet[14, 1])[0], {"first_name": get_names_from_string(sheet[14, 1])[0],
"last_name":get_names_from_string(sheet[14, 1])[1], "last_name":get_names_from_string(sheet[14, 1])[1],
"role": "Data_Protection_Officer", "role": "Data_Protection_Officer",
"email":sheet[10,1], "email":sheet[15,1].strip(),
"affiliations": [self.process_institution(sheet[7,1])] "affiliations": [self.process_institution(sheet[7,1])]
}]) }])
if sheet[16, 1] and sheet[18, 1]: if sheet[16, 1] and sheet[18, 1]:
dataset_dict["contacts"].append({"first_name": get_names_from_string(sheet[14, 1])[0], dataset_dict["contacts"].append({"first_name": get_names_from_string(sheet[16, 1])[0],
"last_name": get_names_from_string(sheet[14, 1])[1], "last_name": get_names_from_string(sheet[16, 1])[1],
"role": sheet[18,1], "role": sheet[18,1],
"email": sheet[10,1], "email": sheet[17,1].strip(),
"affiliations": [self.process_institution(sheet[7,1])] "affiliations": [self.process_institution(sheet[7,1])]
}) })
......
...@@ -81,23 +81,28 @@ def process_possible_date(possible_date): ...@@ -81,23 +81,28 @@ def process_possible_date(possible_date):
def get_names_from_string(full_name):
    """Split a free-text full name into first and last name.

    The first whitespace-separated token becomes the first name; all
    remaining tokens, joined by single spaces, become the last name.

    Args:
        full_name: The name as typed into the spreadsheet cell. ``None`` is
            treated like an empty cell.

    Returns:
        A two-element list ``[first_name, last_name]``. Parts that cannot be
        determined are empty strings.

    An error is logged (but the name is still split) when the text appears
    to contain an academic or honorific title, because the title would
    otherwise end up inside one of the name parts.
    """
    result = ['', '']
    # Guard before calling .strip(): an empty cell may arrive as None.
    if full_name is None:
        return result
    name = full_name.strip()
    if name_cointains_title(name):
        logging.error(f'Name contains titles: {name}')
    # Drop trailing commas together with any whitespace left in front of
    # them, so "Jane Doe ," does not yield a last name of "Doe ".
    name = name.rstrip(', \t')
    # split() without arguments collapses runs of whitespace, so doubled
    # spaces never produce empty tokens or padded last names.
    name_list = name.split()
    if name_list:
        result[0] = name_list[0]
        result[1] = " ".join(name_list[1:])
    return result
# Lower-cased title fragments that should not appear inside a person's name.
# 'ph' and 'sc' catch dotted spellings such as "Ph.D." and "M.Sc.".
_NAME_TITLES = frozenset([
    'dr', 'prof', 'mr', 'mrs', 'ms', 'msc', 'mga', 'mph', 'sc', 'drph',
    'rndr', 'phd', 'ph', 'msi',
])

# Punctuation that commonly glues a title to a name; mapped to spaces so the
# title becomes a token of its own.
_NAME_TITLE_SEPARATORS = str.maketrans({char: ' ' for char in '.,;()'})


def name_cointains_title(full_name):
    """Return True when *full_name* contains an academic or honorific title.

    The name is lower-cased, common punctuation (``. , ; ( )``) is replaced
    by spaces, and the resulting whitespace-separated tokens are compared
    against a fixed set of known titles. Matching is done on whole tokens,
    so a surname such as "Drake" is not mistaken for "Dr".

    Args:
        full_name: The name to inspect. ``None`` or an empty string yields
            ``False``.

    Returns:
        ``True`` if at least one token is a known title, else ``False``.
    """
    if not full_name:
        return False
    tokens = full_name.lower().translate(_NAME_TITLE_SEPARATORS).split()
    return not _NAME_TITLES.isdisjoint(tokens)
def get_lines_from_string(a_string): def get_lines_from_string(a_string):
result = [] result = []
stripped = a_string.strip() stripped = a_string.strip()
...@@ -142,4 +147,4 @@ def save_exported_datasets_to_file(exported_dataset, output_file): ...@@ -142,4 +147,4 @@ def save_exported_datasets_to_file(exported_dataset, output_file):
"$schema": "https://git-r3lab.uni.lu/pinar.alper/metadata-tools/raw/master/metadata_tools/resources/elu-dataset.json", "$schema": "https://git-r3lab.uni.lu/pinar.alper/metadata-tools/raw/master/metadata_tools/resources/elu-dataset.json",
"items": items "items": items
} }
return json.dump(obj, output_file, indent=4) return json.dump(obj, output_file, ensure_ascii = False, indent=4)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment