Commit ba354a57 authored by Pinar Alper
Browse files

Fixed date extraction

parent 4e393f4c
......@@ -124,9 +124,6 @@
]
}
},
"source_type": {
"type": "string"
},
"source_project": {
"type": "string"
},
......
......@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date
class TestProjectsParser(TestCase):
......@@ -22,7 +22,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
# print('----> {}'.format(full_file_path))
#print('----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
......@@ -97,7 +97,7 @@ class TestProjectsParser(TestCase):
has_time_limis = process_yes_no_dontknow_answer(sheet[41, 2])
if has_time_limis and sheet[42, 2]:
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration.' + sheet[42, 2]})
'note': 'Data is obtained for a limited duration.' + process_possible_date(sheet[42, 2])})
dataset_data['use_restrictions'] = use_restrictions
......
......@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_dontknow_answer, \
process_yes_no_answer, add_storage_locations, SHEETS_FOLDER
process_yes_no_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date
class TestProjectsParser(TestCase):
......@@ -68,7 +68,7 @@ class TestProjectsParser(TestCase):
has_time_limis = process_yes_no_dontknow_answer(sheet[27, 2])
if has_time_limis and sheet[28, 2]:
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration.' + sheet[28, 2]})
'note': 'Data is obtained for a limited duration.' + process_possible_date(sheet[28, 2])})
if process_yes_no_answer(sheet[29, 2]):
use_restrictions.append({'ga4gh_code': 'PUB',
......
......@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date
class TestProjectsParser(TestCase):
......@@ -23,7 +23,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_OC_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
# print('----> {}'.format(full_file_path))
print('----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
......@@ -92,7 +92,7 @@ class TestProjectsParser(TestCase):
has_time_limis = process_yes_no_dontknow_answer(sheet[42, 2])
if has_time_limis and sheet[43, 2]:
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration.' + sheet[43, 2]})
'note': 'Data is obtained for a limited duration.' + process_possible_date(sheet[43, 2])})
dataset_data['use_restrictions'] = use_restrictions
......
......@@ -7,7 +7,7 @@ import datetime
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, is_data_sheet, collect_prj_info, \
get_names_from_string, SHEETS_FOLDER
get_names_from_string, SHEETS_FOLDER, process_possible_date
class TestProjectsParser(TestCase):
......@@ -55,6 +55,42 @@ class TestProjectsParser(TestCase):
idx += 1
return
def test_duplicate_dataset_title(self):
    """Report dataset titles that occur more than once across the data sheets.

    Walks ``SHEETS_FOLDER``, opens every workbook whose file name is accepted
    by ``is_data_sheet`` and reads the title cell (column index 2) from every
    sheet except the first one. A title that has already been seen is printed
    together with the path of the file it was found in. Nothing is asserted,
    so this test only reports duplicates and never fails because of them.
    """
    # Row index of the title cell, keyed by the file-name prefix of the workbook.
    title_row_by_prefix = (
        ('from-repository', 5),
        ('from-collaborator', 4),
        ('own-cohort', 4),
    )
    titles = set()
    for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
        for fname in fileList:
            if not is_data_sheet(fname):
                continue
            full_file_path = os.path.join(dirName, fname)
            book = pyexcel.get_book(file_name=full_file_path)
            title_row = None
            for prefix, row in title_row_by_prefix:
                if fname.startswith(prefix):
                    title_row = row
                    break
            if title_row is None:
                # File name matches none of the known prefixes: nothing to read.
                continue
            # Sheet 0 is skipped (presumably it is not a dataset sheet -- confirm).
            for idx in range(1, book.number_of_sheets()):
                title = book.sheet_by_index(idx)[title_row, 2]
                if title in titles:
                    print('Duplicate title {} in file {}'.format(title, full_file_path))
                else:
                    titles.add(title)
    return
def test_export_projects(self):
projects_list = []
......@@ -64,15 +100,8 @@ class TestProjectsParser(TestCase):
prj_data['acronym'] = acr
prj_data['title'] = title
prj_data['description'] = description
if type(start) is datetime.date:
prj_data['start_date'] = start.strftime('%m/%d/%Y')
elif type(start) is str:
prj_data['start_date'] = start.replace('.', '/')
if type(end) is datetime.date:
prj_data['end_date'] = end.strftime('%m/%d/%Y')
elif type(end) is str:
prj_data['end_date'] = end.replace('.', '/')
prj_data['start_date'] = process_possible_date(start)
prj_data['end_date'] = process_possible_date(end)
contacts_list = []
delimeter = ','
if ';' in pi:
......
import os
import datetime
import pyexcel
SHEETS_FOLDER = '/Users/pinar_alper/desktop/test-ANSWERS'
SHEETS_FOLDER = '/Users/pinar_alper/ownCloud/Data Protection/RequirementAnalysis/LCSB-Inventory/internal_data_survey/ANSWERS'
def is_data_sheet(fname):
return fname.startswith('from-repository') or fname.startswith('from-collaborator') or fname.startswith(
......@@ -50,7 +52,7 @@ def collect_prj_info(sheets_folder):
sheet = book.sheet_by_name('projects')
prj_acronyms = sheet.column[1]
numprojects = len(prj_acronyms) - 2
# print('{}---> {} ----> {}'.format(fname, len(prj_acronyms), prj_acronyms))
print('{}---> {} ----> {}'.format(fname, len(prj_acronyms), prj_acronyms))
if numprojects > 0:
for row in range(2, 2 + numprojects):
projects.append((sheet[row, 1], full_file_path, sheet[row, 2], sheet[row, 3], sheet[row, 4],
......@@ -58,6 +60,11 @@ def collect_prj_info(sheets_folder):
sheet[row, 10], sheet[row, 11], sheet[row, 12]))
return projects
def process_possible_date(possible_date):
    """Return a spreadsheet cell value as a '/'-separated date string.

    :param possible_date: cell value that may be a ``datetime.date`` (which
        includes ``datetime.datetime``), a string, ``None`` or any other value.
    :return: ``YYYY/MM/DD`` for date objects; an empty string for ``None``;
        otherwise ``str(possible_date)`` with every ``.`` replaced by ``/``.
    """
    if possible_date is None:
        # Avoid leaking the literal text 'None' into exported date fields.
        return ''
    if isinstance(possible_date, datetime.date):
        return possible_date.strftime("%Y/%m/%d")
    return str(possible_date).replace('.', '/')
def process_data_types(xls_data_type_list):
result = []
......@@ -194,9 +201,12 @@ def get_names_from_string(full_name):
if name is not None:
if " " in name:
name_list = name.split(" ")
len_name = len(name_list)
result[0] = name_list[0]
if len(name_list) > 1:
if len_name > 1:
result[1] = name_list[1]
if len_name == 3:
result[1] = result[1] + ' ' + name_list[2]
else:
result[0] = name
return result
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment