Skip to content
Snippets Groups Projects
Commit c31c62da authored by Carlos Vega's avatar Carlos Vega
Browse files

WIP: import file for pdp project

parent db04d75d
No related branches found
No related tags found
1 merge request!178Feature/import data pdp
import os
import django
from django.conf import settings
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "smash.settings")
django.setup()
import pandas as pd
import numpy as np
import logging
import datetime
import string
from web.models.constants import SEX_CHOICES, SEX_CHOICES_MALE, SEX_CHOICES_FEMALE
from web.algorithm import VerhoeffAlgorithm, LuhnAlgorithm
from web.utils import is_valid_social_security_number
'''
# Things that *could* aready be on the database:
- Language
- Country
- A subject with the same SS number
- A subject with the same ND number
- Referals (Health Partner)
- Location
- Flying Team
# Columns to be transformed to a standard format
- Gender
- Language
- Prefered writen language
- SS number
- Date of birth
- Date added (V1)
- ... (V2)
- ... (V3)
- ... (V4)
Boolean
- Deceased
- Postponed
- Resigned
- Excluded
- PDP 1.0
'''
'''
Column Converter Functions
'''
# in converters dict
def parse_column_date_of_birth(date):
return datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y-%m-%d')
gender_table = {'m': SEX_CHOICES_MALE, 'f': SEX_CHOICES_FEMALE}
def parse_column_gender(gender):
try:
return gender_table[gender.lower()]
except:
return None
def parse_column_ss_number(ss):
ss = ss.replace(' ', '')
if not ss.isdigit():
return None
if len(ss) == 11:
logging.debug('Old SS number: |{}|'.format(ss))
eleventh_digit = VerhoeffAlgorithm.calculate_verhoeff_check_sum(ss[
:11])
twelfth_digit = LuhnAlgorithm.calculate_luhn_checksum(ss[:11])
ss = ss + eleventh_digit + twelfth_digit
# we revalidate the old ss too after we append the digits
if len(ss) == 13:
if not is_valid_social_security_number(ss):
logging.debug('Invalid SS number: |{}|'.format(ss))
return None
else:
logging.debug('Invalid SS number: (Length not valid) |{}|'.format(ss))
return ss
return ss
# not in coverters dict
language_table = {
'L': 'Luxembourgish',
'LT': 'Lithuanian',
'IT': 'Italian',
'F': 'French',
'D': 'German',
'E': 'English',
'P': 'Portuguese',
'A': 'Arabic',
'SP': 'Spanish'
}
language_translation_table = {
# deletions
ord(u')'): None,
ord(u'('): None,
ord(u' '): None,
# replacements
ord(u','): u';'
}
def apply_column_prefered_language(languages):
return apply_column_languages(languages)[:1]
def apply_column_languages(languages):
if type(languages) != float:
# replacements and transformations
languages = unicode(languages).upper().translate(
language_translation_table)
new_list = []
for language in languages.split(';'):
if language in language_table:
language = language_table[language]
new_list.append(language)
return np.array(new_list)
else:
logging.debug(
'Parse Languages: Empty or invalid Languages: |{}|'.format(languages))
return np.array([])
converters = {
'Date of birth': parse_column_date_of_birth,
'Gender': parse_column_gender,
'SS Number': parse_column_ss_number
}
dtype = {
'Languages': object,
'Prefered writen language': object
}
def processFile(file, converters=converters, dtype=dtype):
return pd.read_excel(file, converters=converters, dtype=dtype)
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
df = processFile(
'/Users/carlos.vega/ownCloud/Documents/Projects/PDP/copy.xlsx')
df['Languages'] = df['Languages'].apply(apply_column_languages)
df['Prefered writen language'] = df[
'Prefered writen language'].apply(apply_column_prefered_language)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment