CSV Loader
Packages needed:
os
pandas
sqlite3
numpy
This sets of scripts works as 2 independent situations:
[1] Bulk folder load
The function in this script looks inside a folder and using all *.csv begins a process of concatenating all csvs together and removing duplpicates on each merge.
import pandas as pd, os, pdb, sqlite3, numpy as np
pd.set_option('display.min_rows', 25) # How many to show
pd.set_option('display.max_rows', 25) # How many to show
death_folder = 'data_Death'
intubation_folder = 'data_Intubation'
demographics_folder = 'data_demographics'
covid_test_results_folder = 'data_COVID_TestResults'
chest_imaging_reports_folder = 'data_chestImagingReports'
db_dir = 'db'
# Death example for function (Could require lots of Ram and DB grows in size...poorly coded to scale, but good for initializing a DB.)
def create_table_from_folder_csvs(folder=None, demographic_table=False):
# Could freeze 8 GB RAM machines as this lazily reads all csvs into RAM
# HOWEVER IF YOU HAVE A MONSTER MACHINE USE IT...IT'S ABOUT 50+ TIMES FASTER TO USE THIS FUNCTION IF YOU CAN
# Recommended: Use only a coulple files to define scheme and then use load csvs to load 1 at a time.
conn = sqlite3.connect(os.path.join(db_dir,'covid.db'))
if not isinstance(folder, str):
return("Not a valid folder")
files = [i for i in os.listdir(folder) if i.find('.csv') != -1]
print(files)
# base frame for all but demographics data
data = pd.DataFrame()
# base frame for demographics data (2 table types)
demographics = pd.DataFrame()
socialhistory = pd.DataFrame()
for file in files:
csv = pd.read_csv(os.path.join(folder, file))
# demographic tables have 2 schemes
if demographic_table == True:
if file.find('Demographics') != -1:
demographics = pd.concat([demographics, csv], axis=0)
print(demographics.shape)
demographics = demographics[~demographics.duplicated(keep='first')]
elif file.find('SocialHistory') != -1:
socialhistory = pd.concat([socialhistory, csv], axis=0)
print(socialhistory.shape)
socialhistory = socialhistory[~socialhistory.duplicated(keep='first')]
else:
data = pd.concat([data, csv], axis=0)
print(data.shape)
data = data[~data.duplicated(keep='first')]
frames = {'data':data, 'demographics':demographics, 'socialhistory':socialhistory}
for key in frames.keys():
frame = frames[key]
if frame.shape[0] != 0:
# pdb.set_trace()
if key == "demographics":
frame.to_sql(f"{folder}_demographics", conn, if_exists="replace", index=False)
elif key == "socialhistory":
frame.to_sql(f"{folder}_socialhistory", conn, if_exists="replace", index=False)
else:
frame.to_sql(f"{folder}", conn, if_exists="replace", index=False)
create_table_from_folder_csvs(death_folder)
create_table_from_folder_csvs(intubation_folder)
create_table_from_folder_csvs(demographics_folder, demographic_table=True)
create_table_from_folder_csvs(covid_test_results_folder)
create_table_from_folder_csvs(chest_imaging_reports_folder)
[2] Load a new csv
This script could load every csv but it much much slower. Use this once a bulk database is made andd you want to add a new csv.
import pandas as pd, os, pdb, sqlite3, numpy as np
pd.set_option('display.min_rows', 25) # How many to show
pd.set_option('display.max_rows', 25) # How many to show
death_folder = 'data_Death'
intubation_folder = 'data_Intubation'
demographics_folder = 'data_demographics'
covid_test_results_folder = 'data_COVID_TestResults'
chest_imaging_reports_folder = 'data_chestImagingReports'
db_dir = 'db'
def load_csv(folder=None, file=None, demographic_table=False):
# Proper input check
if not isinstance(folder, str):
return("Not a valid folder\\table name")
if not isinstance(file, str):
return("Not a valid file")
# DB Connection
conn = sqlite3.connect(os.path.join(db_dir,'covid.db'))
c = conn.cursor()
# Utility
def check_if_table_exists_in_db(table):
c.execute(f''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table}' ''')
if c.fetchone()[0]==1 :
print('Table exists.')
return(True)
else:
print('Table does not exist.')
return(False)
### table name
table = folder
if demographic_table:
if file.find('Demographics') != -1:
table = folder+'_demographic'
elif file.find('SocialHistory') != -1:
table = folder+'_socialhistory'
# Load Data
csv = pd.read_csv(os.path.join(folder, file))
# Create Table if it doesn't exist
if not check_if_table_exists_in_db(table):
print("creating table")
schema = ''
for col in csv.columns:
type_key = {'object':'TEXT', 'int64':'INTEGER', 'float64':'FLOAT'}
schema += f"'{col}' {type_key[str(csv.dtypes[col])]}, "
schema = schema[0:-2]
c.execute(f""" create table '{table}' ({schema})""")
def check_row_against_table(df_row, table=table ,conn=conn):
# Here I had to pass in conn as it wasn't able to be seen otherwise.
# This is because this is apart of the "apply" function in pandas (probably some sort of scoping issue)
# See "csv.apply(check_row_against_table, axis=1)" below this function definition
print(df_row) # Optional and makes things slower, but it's a good visual
where_filter = ''
insert_statements = ''
for col in df_row.keys():
if pd.isnull(df_row[col]) == True:
# Build element for queries escaping single quotes in text string for sqlite
where_filter += f"{col} is null and "
insert_statements += f"null, "
else:
value = str(df_row[col]).replace('\'','\'\'') # escape quotes for use in sqlite
where_filter += f"{col} = '{value}' and "
insert_statements += f"'{value}', "
# pdb.set_trace()
# Trim commas
where_filter = where_filter[0:-5] # trimming " and "
insert_statements = insert_statements[0:-2] # trimming ", "
# Execute search for current record
query = f'''select * from {table} where {where_filter};'''
c.execute(query)
# c.execute("select * from data_Death where MRN = 'not a chance'") # Use this for debugging
try:
# Record is a duplicate if it doesn't throw exception
original_record = c.fetchone()[0]
except:
# Record is brand new
# pdb.set_trace()
c.execute(f"insert into {table} values ({insert_statements});")
# commit the changes to db
conn.commit()
# Loop over every row in csv and call above function
csv.apply(check_row_against_table, axis=1)
#close the connection
conn.close()
# #xample executions:
# data_Death
load_csv(folder=death_folder, file='Mortality_20200414110000.csv')
load_csv(folder=death_folder, file='Mortality_20200423110000.csv')
load_csv(folder=death_folder, file='Mortality_20200417110000.csv')
load_csv(folder=death_folder, file='Mortality_20200422110000.csv')
load_csv(folder=death_folder, file='Mortality_20200420090000.csv')
load_csv(folder=death_folder, file='Mortality_20200421110000.csv')
load_csv(folder=death_folder, file='Mortality_20200424080000.csv')
load_csv(folder=death_folder, file='Mortality_20200418090000.csv')
load_csv(folder=death_folder, file='Mortality_20200419090000.csv')
load_csv(folder=death_folder, file='Mortality_20200416090000.csv')
load_csv(folder=death_folder, file='Mortality_20200415090000.csv')
# data_Intubation
load_csv(folder=intubation_folder, file='Intubation_20200421090000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200418180000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200420180000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200416100000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200423110000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200422090000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200415100000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200424080000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200414160000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200417100000.csv')
load_csv(folder=intubation_folder, file='Intubation_20200419180000.csv')
# data_COVID_TestResults
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200421090000.csv')
load_csv(folder=covid_test_results_folder, file='CVOID_testResults_20200423110000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200415180000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200418180000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200416180000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200422090000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200417180000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200420180000.csv')
load_csv(folder=covid_test_results_folder, file='CVOID_testResults_20200424080000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200419180000.csv')
load_csv(folder=covid_test_results_folder, file='COVID_testResults_20200414160000.csv')
# data_chestImagingReports
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200421090000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200417100000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200414110000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200422090000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200419180000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200424080000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200423110000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200416090000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200418160000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200420180000.csv')
load_csv(folder=chest_imaging_reports_folder, file='XRChest_20200415110000.csv')
# data_demographics
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200415110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200420180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200418160000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200414110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200415180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200417180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200418160000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200415180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200421090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200418160000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200416180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200423110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200416090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200423110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200420180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200419180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200422090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200415180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200423110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200417140000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200418160000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200414110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200416180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200422090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200423110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200415110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200420180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200414110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200414110000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200419180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200421090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200422090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200417180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200416090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200420180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200415180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_Demographics_20200421090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200421090000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200419180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_SocialHistory_20200419180000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='Inpatient_ADT_Demographics_20200417140000.csv', demographic_table=True)
load_csv(folder=demographics_folder, file='RIC_ED_visits_SocialHistory_20200422090000.csv', demographic_table=True)