def itembetter(items, lst):
    '''
    Return the elements of lst found at the positions (or keys) listed in
    items, always as a list and in the same order as items.

    operator.itemgetter returns a bare element for a single index and cannot
    be built without any index at all; this helper returns a list in every
    case: empty for no indices, one element for one index.
    '''
    return [lst[item] for item in items]


def indexof(element, l):
    '''
    Return the list of every position in l whose value equals element.
    The list is empty when element does not occur in l.
    '''
    return [position for position, value in enumerate(l) if value == element]
def _split_cell_lines(cell):
    '''Split a multi-line cell into its stripped, non-empty lines.'''
    # splitlines() copes with '\n', '\r\n' and '\r', so no stray '\r' is left
    # at the end of an entry; blank lines are dropped because an empty entry
    # is never a usable voucher reference or voucher type.
    return [line.strip() for line in cell.splitlines() if line.strip()]


def parse_voucher_reference(vr):
    '''
    Turn the content of a voucher reference cell into a list of references,
    one per line of the cell. An empty or blank cell gives an empty list.
    '''
    return _split_cell_lines(vr)


def parse_voucher_type(vt):
    '''
    Turn the content of a voucher activity cell into a list of entries, one
    per line of the cell.

    A cell that contains the word "none" anywhere, in any letter case, is
    treated as empty. An empty or blank cell gives an empty list.
    '''
    if 'NONE' in vt.upper():
        return []
    return _split_cell_lines(vt)


def parse_boolean(boolean_Y_N):
    '''
    Return True if the value is 'y' or 'Y' (surrounding blanks are ignored).
    Otherwise return False, even if the value cannot be interpreted at all
    (NaN, None, numbers...).
    '''
    try:
        if isinstance(boolean_Y_N, float) and np.isnan(boolean_Y_N):
            # NaN is what pandas uses for an empty cell
            return False
        return boolean_Y_N.strip().upper() == 'Y'
    except Exception as e:
        # Deliberately best-effort: a bad cell must not stop the import.
        # The exception is logged with %r because e.message does not exist on
        # Python 3, where reading it would raise from inside this handler.
        logging.warning('parse_boolean failed for %r: %r', boolean_Y_N, e)
        return False
# birth date
def parse_column_date_of_birth(date):
    '''
    Convert a date written as 'dd.mm.yyyy' into the 'yyyy-mm-dd' form.

    Surrounding blanks are ignored. Raises ValueError when the text is not a
    date in the 'dd.mm.yyyy' format.
    '''
    parsed = datetime.datetime.strptime(date.strip(), '%d.%m.%Y')
    # isoformat() gives the same 'yyyy-mm-dd' text as strftime('%Y-%m-%d') and
    # also accepts years before 1900, which strftime refuses on Python 2.
    return parsed.date().isoformat()


# gender
gender_table = {'m': SEX_CHOICES_MALE, 'f': SEX_CHOICES_FEMALE}


def parse_column_gender(gender):
    '''
    Map 'm' / 'f' (any letter case, surrounding blanks ignored) to the
    matching SEX_CHOICES_* constant. Return None for anything else,
    including values that are not strings.
    '''
    try:
        return gender_table[gender.strip().lower()]
    except (KeyError, AttributeError):
        # KeyError: unknown code; AttributeError: not a string (None, NaN...)
        return None
# Country

country_table = {
    'LUX': 'Luxembourg'
}


def apply_column_country(country):
    '''
    Replace a country code found in country_table by the country name.

    Codes are matched ignoring letter case and surrounding blanks. A value
    that is not a known code is reported in the log and returned unchanged.
    '''
    key = country.strip().upper() if hasattr(country, 'strip') else country
    try:
        return country_table[key]
    except (KeyError, TypeError):
        # KeyError: not a known code; TypeError: value cannot be a dict key
        logging.warning('Invalid Country: %s', country)
        return country


# add voucher for subject
# Partner codes used inside voucher references, with the partner name to store
voucher_partners = {'ZIT': 'Zitha'}


def add_subject_vouchers(voucher_reference, referral, voucher_types):
    '''
    Store the voucher described by voucher_reference and return it.

    voucher_reference: five pieces joined by '-': ND number, issue date as
        YYYYMMDD, voucher partner code, voucher type code and a trailing
        piece that is not used here.
    referral: worker stored as the issue worker of the voucher.
    voucher_types: dict whose values are the voucher types the partner is
        allowed to use.

    The voucher partner is created as a Worker when missing and receives the
    voucher partner role for the global study. The voucher expires 365 days
    after its issue date and is stored with status VOUCHER_STATUS_IN_USE.

    Raises ValueError when the reference does not have five pieces or its
    date is not in the YYYYMMDD format. Raises the model's DoesNotExist when
    the voucher type code or the ND number is unknown.
    '''
    parts = voucher_reference.split('-')
    if len(parts) != 5:
        raise ValueError(
            'Invalid voucher reference: {!r}'.format(voucher_reference))
    nd_number, date, voucher_partner, voucher_type, _number = parts

    issue_date = datetime.datetime.strptime(date, '%Y%m%d')
    expiry_date = issue_date + datetime.timedelta(days=365)

    # Unknown partner codes are used as the partner name as they are
    usage_partner, created = Worker.objects.update_or_create(
        name=voucher_partners.get(voucher_partner, voucher_partner))
    usage_partner.roles.update(role=ROLE_CHOICES_VOUCHER_PARTNER)
    # create workerStudyRole
    WorkerStudyRole.objects.update_or_create(
        worker=usage_partner, study_id=GLOBAL_STUDY_ID,
        role=ROLE_CHOICES_VOUCHER_PARTNER)
    usage_partner.voucher_types.set(voucher_types.values())
    usage_partner.save()

    if created:
        logging.warning('New Voucher Partner created: %s', voucher_partner)

    vt = VoucherType.objects.get(code=voucher_type)
    study_subject = StudySubject.objects.get(nd_number=nd_number)

    # NOTE(review): every field is part of the lookup, so an existing voucher
    # is only reused when all of them match - confirm this is intended.
    voucher, created = Voucher.objects.update_or_create(
        number=voucher_reference, issue_date=issue_date,
        expiry_date=expiry_date, voucher_type=vt,
        study_subject=study_subject, status=VOUCHER_STATUS_IN_USE,
        usage_partner=usage_partner, issue_worker=referral)
    if created:
        # only announce vouchers that were really added by this call
        logging.warning('New Voucher added: %s', voucher_reference)
    return voucher
# create voucher types
def create_voucher_types(voucher_types_dict, study):
    '''
    Create or update one VoucherType of the given study per entry of
    voucher_types_dict, which maps a description to a code.

    Return a dict mapping each description to its VoucherType.
    '''
    voucher_types = {}
    for name, code in voucher_types_dict.items():
        voucher_type, _ = VoucherType.objects.update_or_create(
            code=code, description=name, study=study)
        voucher_types[name] = voucher_type
    return voucher_types


# create appointment types
def create_appointment_types(assessments):
    '''
    Create or update one AppointmentType per entry of assessments, which maps
    a description to a default duration.

    The code of each type is made of the upper case letters of its
    description ('Cognitive Test' gives 'CT'). The types are returned as a
    list, in the iteration order of assessments.
    '''
    appointmentTypes = []
    for name, duration in assessments.items():
        # Joining the characters gives a plain string on both Python 2 and 3
        # and for both str and unicode names; filter(str.isupper, name)
        # rejects unicode on Python 2 and returns an iterator on Python 3.
        code = ''.join(letter for letter in name if letter.isupper())
        # update_or_create already stores the object, no extra save() needed
        appointmentType, _ = AppointmentType.objects.update_or_create(
            code=code, default_duration=duration, description=name)
        appointmentTypes.append(appointmentType)
    return appointmentTypes
def parse_row(index, row, visit_columns, appointmentTypes, voucher_types):
    '''
    Store one converted spreadsheet row in the database.

    Creates (or reuses) the languages, country, location, flying team,
    health partner, subject and study subject named in the row, adds the
    row's vouchers, and records every date found in the visit columns as one
    finished visit with one finished appointment per distinct date.

    index: label of the row, only used in log messages.
    row: the row, with upper case column names and converted values.
    visit_columns: list of the names of the columns holding visit dates.
    appointmentTypes: list of appointment types; the n-th type belongs to
        the n-th date found in the visit columns.
    voucher_types: dict whose values are the voucher types every study
        subject and voucher partner may use.
    '''
    # Languages
    # When only one of the two language columns is filled in, it is used for
    # both. len() is used on purpose: the cells may hold numpy arrays, whose
    # truth value is ambiguous.
    spoken = row['LANGUAGES']
    preferred = row['PREFERED WRITEN LANGUAGE']
    if len(spoken) == 0 and len(preferred) == 0:
        logging.warning('No Languages available')
    elif len(spoken) == 0:
        spoken = preferred
    elif len(preferred) == 0:
        preferred = spoken

    languages = []
    for language in spoken:
        lang, created = Language.objects.get_or_create(name=language)
        languages.append(lang)
        if created:
            logging.warning('New Language added: %s', language)
            lang.save()

    # Stays None when the row names no language at all; without this default
    # the name would be unbound further down.
    pref_lang = None
    for language in preferred[:1]:
        pref_lang, created = Language.objects.get_or_create(name=language)
        if created:
            logging.warning(
                'New Language (from Prefered) added: %s', language)
            pref_lang.save()

    # Country
    country, created = Country.objects.get_or_create(name=row['COUNTRY'])
    if created:
        logging.warning('New Country added: %s', row['COUNTRY'])
        country.save()

    # Location and Flying Team
    # If no FT, then default location is DEFAULT_LOCATION
    ft = None
    if not row['FLYING TEAM (FT)']:
        location, created = Location.objects.get_or_create(
            name=DEFAULT_LOCATION)
        if created:
            logging.warning('New location added: %s', DEFAULT_LOCATION)
            location.save()
    else:
        location, created = Location.objects.get_or_create(
            name='Flying Team')
        if created:
            logging.warning('New location added: Flying Team')
            location.save()
        # Create Flying Team
        ft, created = FlyingTeam.objects.get_or_create(
            place=row['LOCATION OF FT'])
        if created:
            logging.warning('New Flying Team added: %s', row['LOCATION OF FT'])
            ft.save()

    # Health Partner
    # create health partner (Referral)
    health_partner, created = Worker.objects.get_or_create(
        name=row['REFERRAL'])
    health_partner.roles.update(role=ROLE_CHOICES_HEALTH_PARTNER)
    # create workerStudyRole
    WorkerStudyRole.objects.update_or_create(
        worker=health_partner, study_id=GLOBAL_STUDY_ID,
        role=ROLE_CHOICES_HEALTH_PARTNER)
    health_partner.save()
    if created:
        logging.warning('New Health Partner added: %s', row['REFERRAL'])

    # Subject
    # The lookup fields are used for the creation as well, so they do not
    # have to be repeated in defaults.
    subject, created = Subject.objects.get_or_create(
        social_security_number=row['SS NUMBER'],
        first_name=row['FIRST NAME'],
        last_name=row['LAST NAME'],
        defaults={
            'sex': row['GENDER'],
            'phone_number': row['PHONE NUMBER 1'],
            'phone_number_2': row['PHONE NUMBER 2'],
            'email': row['E-MAIL'],
            'date_born': row['DATE OF BIRTH'],
            'address': row['ADDRESS'],
            'postal_code': row['POSTAL CODE'],
            'city': row['CITY'],
            'country': country,
            'dead': row['DECEASED'],
            'default_written_communication_language': pref_lang
        })

    subject.languages.set(languages)
    subject.save()

    if created:
        logging.warning(
            'New Subject added with SS number: %s', row['SS NUMBER'])

    # StudySubject
    study = Study.objects.filter(id=GLOBAL_STUDY_ID)[0]

    study_subject_defaults = {
        'study': study,
        'postponed': row['POSTPONED'],
        'resigned': row['RESIGNED'],
        'resign_reason': row['REASON'],
        'type': SUBJECT_TYPE_CHOICES_PATIENT,
        'excluded': row['EXCLUDED'],
        'exclude_reason': row['REASON.1'],
        'previously_in_study': row['PDP 1.0'],
        'comments': row['COMMENT']
    }
    # The visit cells may carry comments such as "(Tel)" next to the date, so
    # only the date part is parsed. When the cell holds no date the field is
    # left out and the model decides the value.
    date_added = date_regex.search(row['DATE ADDED (V1)'])
    if date_added:
        study_subject_defaults['date_added'] = parse_column_date_of_birth(
            date_added.group())
    else:
        logging.warning('No date added for ND number %s (row %s)',
                        row['ND NUMBER'], index)

    studySubject, created = StudySubject.objects.get_or_create(
        subject=subject, nd_number=row['ND NUMBER'],
        defaults=study_subject_defaults)

    # all study subjects can have all voucher types
    studySubject.voucher_types.set(voucher_types.values())
    studySubject.save()

    if created:
        logging.warning(
            'New StudySubject added with ND number: %s', row['ND NUMBER'])

    # VOUCHERS
    for voucher_reference in row['VOUCHER REFERENCE']:
        add_subject_vouchers(voucher_reference, health_partner, voucher_types)

    # Visits
    # Consider all visits as part of the same visit with multiple appointments

    # Every date found in the visit cells, in column order, ignoring comments
    # such as Tel. A real list is needed because it is read several times.
    visit_dates = [
        datetime.datetime.strptime(text, '%d.%m.%Y')
        for cell in row[visit_columns].values
        for text in date_regex.findall(cell)
    ]
    if not visit_dates:
        # nothing to record; the subject and vouchers above are kept
        logging.warning('No visit dates found for ND number %s (row %s)',
                        row['ND NUMBER'], index)
        return

    datetime_begin = min(visit_dates).strftime('%Y-%m-%d')
    datetime_end = max(visit_dates).strftime('%Y-%m-%d')
    # sorted so that appointments are always created in date order
    distinct_dates = sorted(set(visit_dates))

    visit, created = Visit.objects.get_or_create(
        subject=studySubject, datetime_begin=datetime_begin,
        datetime_end=datetime_end, defaults={'is_finished': True})
    if created:
        logging.warning('New Visit added for ND number %s starting on %s',
                        row['ND NUMBER'], datetime_begin)

    # in this case appointment types are incremental
    visit.appointment_types.set(appointmentTypes[:len(distinct_dates)])
    visit.save()

    # If there are two Vx with the same date we put together the appointment
    # types in the same appointment
    for visit_date in distinct_dates:
        datetime_when = visit_date.strftime('%Y-%m-%d')

        # get the indices of each occurrence of the date and use them to get
        # the appointment types
        # NOTE(review): raises IndexError when a row holds more dates than
        # there are appointment types - confirm that cannot happen.
        appointment_types = itembetter(
            indexof(visit_date, visit_dates), appointmentTypes)

        # create appointment
        appointment, _ = Appointment.objects.update_or_create(
            visit=visit,
            length=sum(a.default_duration for a in appointment_types),
            flying_team=ft, location=location,
            status=Appointment.APPOINTMENT_STATUS_FINISHED,
            datetime_when=datetime_when)

        # The appointment types follow each other, the first starting at 9:00
        date_when = visit_date.replace(
            hour=9, minute=0, second=0, microsecond=0)
        for appointment_type in appointment_types:
            # Stored through the manager: building the link object without
            # saving it never reaches the database. update_or_create keeps a
            # second import of the same file from duplicating the links.
            AppointmentTypeLink.objects.update_or_create(
                appointment=appointment, appointment_type=appointment_type,
                defaults={'date_when': date_when})
            date_when += datetime.timedelta(
                minutes=appointment_type.default_duration)
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    # The workbook to import can be given as first command line argument;
    # without one the path used so far is kept as a default.
    default_path = '/Users/carlos.vega/ownCloud/Documents/Projects/PDP/copy.xlsx'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Text type of the running interpreter (unicode only exists on Python 2)
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Everything is read as text: empty cells become '' and the converters
    # below receive strings only.
    df = pd.read_excel(path, dtype=object)
    df = df.fillna('').astype(text_type)
    df.columns = [c.upper() for c in df.columns]

    # make transformations
    # (done here rather than through read_excel(converters=...), which does
    # not allow converters that return a list)
    for column, function in converters.items():
        logging.warning(column)
        df[column] = df[column].apply(function)

    study = Study.objects.filter(id=GLOBAL_STUDY_ID)[0]
    # enable vouchers
    study.columns.voucher_types = True
    study.columns.vouchers = True
    study.columns.save()
    study.save()

    # get visits columns: the ones whose name contains (V1), (V2)...
    # Built as a list because it is used to index every row.
    regex = re.compile(r'\(V\d\)')
    visit_columns = [column for column in df.columns if regex.search(column)]

    # description -> default duration; the order matters, the n-th type
    # belongs to the n-th visit date of a row
    assessments = OrderedDict([
        ('Cognitive Test', 180),
        ('Risk Factor', 120),
        ('Voucher Distribution', 120),
        ('Follow Up', 90),
    ])
    appointmentTypes = create_appointment_types(assessments)

    # description -> code
    voucher_types_dict = OrderedDict([
        ('Cognitive Activity', 'CA'),
        ('Neurofit', 'NF'),
        ('Mobilfit', 'MF'),
        ('Diet', 'D'),
        ('Consulte ORL', 'CORL'),
        ('Physical Activity', 'PA'),
        ('Individual Cognitive Training', 'IT'),
        ('Social', 'S'),
        ('Test', 'T'),
    ])
    voucher_types = create_voucher_types(voucher_types_dict, study)

    # process each row
    for index, row in df.iterrows():
        parse_row(index, row, visit_columns, appointmentTypes, voucher_types)