Skip to content
Snippets Groups Projects
import_file.py 3.72 KiB
Newer Older
import os
import django
from django.conf import settings
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "smash.settings")
django.setup()
import pandas as pd
import numpy as np
import logging
import datetime
import string
from web.models.constants import SEX_CHOICES, SEX_CHOICES_MALE, SEX_CHOICES_FEMALE
from web.algorithm import VerhoeffAlgorithm, LuhnAlgorithm
from web.utils import is_valid_social_security_number

'''

# Things that *could* aready be on the database:

- Language
- Country
- A subject with the same SS number
- A subject with the same ND number
- Referals (Health Partner)
- Location
- Flying Team

# Columns to be transformed to a standard format

- Gender
- Language
- Prefered writen language
- SS number
- Date of birth
- Date added (V1)
- ... (V2)
- ... (V3)
- ... (V4)

Boolean
- Deceased
- Postponed
- Resigned
- Excluded
- PDP 1.0 

'''

'''
Column Converter Functions
'''

# in converters dict


def parse_column_date_of_birth(date):
    return datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y-%m-%d')


gender_table = {'m': SEX_CHOICES_MALE, 'f': SEX_CHOICES_FEMALE}


def parse_column_gender(gender):
    try:
        return gender_table[gender.lower()]
    except:
        return None


def parse_column_ss_number(ss):
    ss = ss.replace(' ', '')
    if not ss.isdigit():
        return None
    if len(ss) == 11:
        logging.debug('Old SS number: |{}|'.format(ss))
        eleventh_digit = VerhoeffAlgorithm.calculate_verhoeff_check_sum(ss[
                                                                        :11])
        twelfth_digit = LuhnAlgorithm.calculate_luhn_checksum(ss[:11])
        ss = ss + eleventh_digit + twelfth_digit
    # we revalidate the old ss too after we append the digits
    if len(ss) == 13:
        if not is_valid_social_security_number(ss):
            logging.debug('Invalid SS number: |{}|'.format(ss))
            return None
    else:
        logging.debug('Invalid SS number: (Length not valid) |{}|'.format(ss))
        return ss
    return ss

# not in coverters dict


language_table = {
    'L': 'Luxembourgish',
    'LT': 'Lithuanian',
    'IT': 'Italian',
    'F': 'French',
    'D': 'German',
    'E': 'English',
    'P': 'Portuguese',
    'A': 'Arabic',
    'SP': 'Spanish'
}

language_translation_table = {
    # deletions
    ord(u')'): None,
    ord(u'('): None,
    ord(u' '): None,
    # replacements
    ord(u','): u';'
}


def apply_column_prefered_language(languages):
    return apply_column_languages(languages)[:1]


def apply_column_languages(languages):
    if type(languages) != float:
        # replacements and transformations
        languages = unicode(languages).upper().translate(
            language_translation_table)
        new_list = []
        for language in languages.split(';'):
            if language in language_table:
                language = language_table[language]
            new_list.append(language)
        return np.array(new_list)
    else:
        logging.debug(
            'Parse Languages: Empty or invalid Languages: |{}|'.format(languages))
        return np.array([])

converters = {
    'Date of birth': parse_column_date_of_birth,
    'Gender': parse_column_gender,
    'SS Number': parse_column_ss_number
}

dtype = {
    'Languages': object,
    'Prefered writen language': object
}


def processFile(file, converters=converters, dtype=dtype):
    return pd.read_excel(file, converters=converters, dtype=dtype)

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    df = processFile(
        '/Users/carlos.vega/ownCloud/Documents/Projects/PDP/copy.xlsx')
    df['Languages'] = df['Languages'].apply(apply_column_languages)
    df['Prefered writen language'] = df[
        'Prefered writen language'].apply(apply_column_prefered_language)