diff --git a/smash/import_file.py b/smash/import_file.py new file mode 100644 index 0000000000000000000000000000000000000000..0de8ca99ae12daaecb96167dcef98443422055b3 --- /dev/null +++ b/smash/import_file.py @@ -0,0 +1,155 @@ +import os +import django +from django.conf import settings +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "smash.settings") +django.setup() +import pandas as pd +import numpy as np +import logging +import datetime +import string +from web.models.constants import SEX_CHOICES, SEX_CHOICES_MALE, SEX_CHOICES_FEMALE +from web.algorithm import VerhoeffAlgorithm, LuhnAlgorithm +from web.utils import is_valid_social_security_number + +''' + +#Â Things that *could* aready be on the database: + +- Language +- Country +- A subject with the same SS number +- A subject with the same ND number +- Referals (Health Partner) +- Location +- Flying Team + +# Columns to be transformed to a standard format + +- Gender +- Language +- Prefered writen language +- SS number +- Date of birth +- Date added (V1) +- ... (V2) +- ... (V3) +- ... (V4) + +Boolean +- Deceased +- Postponed +- Resigned +- Excluded +- PDP 1.0 + +''' + +''' +Column Converter Functions +''' + +# in converters dict + + +def parse_column_date_of_birth(date): + return datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y-%m-%d') + + +gender_table = {'m': SEX_CHOICES_MALE, 'f': SEX_CHOICES_FEMALE} + + +def parse_column_gender(gender): + try: + return gender_table[gender.lower()] + except: + return None + + +def parse_column_ss_number(ss): + ss = ss.replace(' ', '') + if not ss.isdigit(): + return None + if len(ss) == 11: + logging.debug('Old SS number: |{}|'.format(ss)) + eleventh_digit = VerhoeffAlgorithm.calculate_verhoeff_check_sum(ss[ + :11]) + twelfth_digit = LuhnAlgorithm.calculate_luhn_checksum(ss[:11]) + ss = ss + eleventh_digit + twelfth_digit + # we revalidate the old ss too after we append the digits + if len(ss) == 13: + if not is_valid_social_security_number(ss): + logging.debug('Invalid SS number: |{}|'.format(ss)) + return None + else: + logging.debug('Invalid SS number: (Length not valid) |{}|'.format(ss)) + return ss + return ss + +# not in coverters dict + + +language_table = { + 'L': 'Luxembourgish', + 'LT': 'Lithuanian', + 'IT': 'Italian', + 'F': 'French', + 'D': 'German', + 'E': 'English', + 'P': 'Portuguese', + 'A': 'Arabic', + 'SP': 'Spanish' +} + +language_translation_table = { + # deletions + ord(u')'): None, + ord(u'('): None, + ord(u' '): None, + # replacements + ord(u','): u';' +} + + +def apply_column_prefered_language(languages): + return apply_column_languages(languages)[:1] + + +def apply_column_languages(languages): + if type(languages) != float: + # replacements and transformations + languages = unicode(languages).upper().translate( + language_translation_table) + new_list = [] + for language in languages.split(';'): + if language in language_table: + language = language_table[language] + new_list.append(language) + return np.array(new_list) + else: + logging.debug( + 'Parse Languages: Empty or invalid Languages: |{}|'.format(languages)) + return np.array([]) + +converters = { + 'Date of birth': parse_column_date_of_birth, + 'Gender': parse_column_gender, + 'SS Number': parse_column_ss_number +} + +dtype = { + 'Languages': object, + 'Prefered writen language': object +} + + +def processFile(file, converters=converters, dtype=dtype): + return pd.read_excel(file, converters=converters, dtype=dtype) + +if __name__ == '__main__': + logging.basicConfig(level=logging.DEBUG) + df = processFile( + '/Users/carlos.vega/ownCloud/Documents/Projects/PDP/copy.xlsx') + df['Languages'] = df['Languages'].apply(apply_column_languages) + df['Prefered writen language'] = df[ + 'Prefered writen language'].apply(apply_column_prefered_language)