import os import django from django.conf import settings os.environ.setdefault("DJANGO_SETTINGS_MODULE", "smash.settings") django.setup() import pandas as pd import numpy as np import logging import datetime import string from web.models.constants import SEX_CHOICES, SEX_CHOICES_MALE, SEX_CHOICES_FEMALE from web.algorithm import VerhoeffAlgorithm, LuhnAlgorithm from web.utils import is_valid_social_security_number ''' # Things that *could* aready be on the database: - Language - Country - A subject with the same SS number - A subject with the same ND number - Referals (Health Partner) - Location - Flying Team # Columns to be transformed to a standard format - Gender - Language - Prefered writen language - SS number - Date of birth - Date added (V1) - ... (V2) - ... (V3) - ... (V4) Boolean - Deceased - Postponed - Resigned - Excluded - PDP 1.0 ''' ''' Column Converter Functions ''' # in converters dict def parse_column_date_of_birth(date): return datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y-%m-%d') gender_table = {'m': SEX_CHOICES_MALE, 'f': SEX_CHOICES_FEMALE} def parse_column_gender(gender): try: return gender_table[gender.lower()] except: return None def parse_column_ss_number(ss): ss = ss.replace(' ', '') if not ss.isdigit(): return None if len(ss) == 11: logging.debug('Old SS number: |{}|'.format(ss)) eleventh_digit = VerhoeffAlgorithm.calculate_verhoeff_check_sum(ss[ :11]) twelfth_digit = LuhnAlgorithm.calculate_luhn_checksum(ss[:11]) ss = ss + eleventh_digit + twelfth_digit # we revalidate the old ss too after we append the digits if len(ss) == 13: if not is_valid_social_security_number(ss): logging.debug('Invalid SS number: |{}|'.format(ss)) return None else: logging.debug('Invalid SS number: (Length not valid) |{}|'.format(ss)) return ss return ss # not in coverters dict language_table = { 'L': 'Luxembourgish', 'LT': 'Lithuanian', 'IT': 'Italian', 'F': 'French', 'D': 'German', 'E': 'English', 'P': 'Portuguese', 'A': 'Arabic', 'SP': 'Spanish' } language_translation_table = { # deletions ord(u')'): None, ord(u'('): None, ord(u' '): None, # replacements ord(u','): u';' } def apply_column_prefered_language(languages): return apply_column_languages(languages)[:1] def apply_column_languages(languages): if type(languages) != float: # replacements and transformations languages = unicode(languages).upper().translate( language_translation_table) new_list = [] for language in languages.split(';'): if language in language_table: language = language_table[language] new_list.append(language) return np.array(new_list) else: logging.debug( 'Parse Languages: Empty or invalid Languages: |{}|'.format(languages)) return np.array([]) converters = { 'Date of birth': parse_column_date_of_birth, 'Gender': parse_column_gender, 'SS Number': parse_column_ss_number } dtype = { 'Languages': object, 'Prefered writen language': object } def processFile(file, converters=converters, dtype=dtype): return pd.read_excel(file, converters=converters, dtype=dtype) if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) df = processFile( '/Users/carlos.vega/ownCloud/Documents/Projects/PDP/copy.xlsx') df['Languages'] = df['Languages'].apply(apply_column_languages) df['Prefered writen language'] = df[ 'Prefered writen language'].apply(apply_column_prefered_language)