Baptiste Vasey 24-07-2019 to 10-04-2023
Jupyter Notebook v6.0.0 Python v3.7.4 DBeaver v6.2.0 (SQL 8+)
Data source: Hospital Alerting Via Electronic Noticeboard (HAVEN) - Oxford University Hospitals NHS Trust
Dependency: vitals_labs_POCTS.sql (see suppl. file IV.2)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import pymysql
import mysql.connector
import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import datetime
import random
import math
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.neighbors import NearestCentroid
from sklearn import metrics
from scipy import linalg
from scipy.spatial.distance import cdist
from scipy.sparse.linalg import expm
from scipy.cluster import hierarchy
from sklearn.cluster import AgglomerativeClustering
import pickle
import itertools
import clusteval
import statistics
import keras
from keras import layers
from tensorflow import random
class Population:
description = "population description"
string = "SQL command"
name = "name for file"
# all inpatient surgical admissions between 01.01.2016 and 31.12.2017, whose first theatre visit had a treatment function code in (100, 104, 105, 106, 301, 306) + general surgery in OR description
pop4 = Population()
pop4.description = "All visceral surgery patient"
pop4.string = """
SELECT DISTINCT first_tv.hadm_id
FROM (
SELECT A.*
FROM havenouh_v15.theatre_visits as A
INNER JOIN (
SELECT hadm_id, min(anaesthesia_time) as anaesthesia_time
FROM havenouh_v15.theatre_visits
GROUP BY hadm_id) as B
ON A.hadm_id = B.hadm_id
AND A.anaesthesia_time = B.anaesthesia_time
WHERE A.main_specialty = "General Surgery"
ORDER BY A.hadm_id
) as first_tv
LEFT JOIN havenouh_v15.admissions as a
ON first_tv.hadm_id = a.hadm_id
LEFT JOIN havenouh_v15.consultants as c
ON first_tv.episode_id = c.episode_id
WHERE ((c.treatment_function_code in (100, 104, 105, 106, 301, 306)) OR (c.treatment_function_code = 180 AND c.specialty_code = 100))
AND (first_tv.main_specialty = 'General Surgery')
AND (to_days(a.admission_time)<>to_days(a.discharge_time))
AND (year(a.admission_time) BETWEEN 2016 and 2017)
ORDER BY first_tv.hadm_id
"""
pop4.name = "visceral_all"
print('populations ready')
def extraction(population):
# connect to database
mydb = mysql.connector.connect(
host="mysql8",
user= os.getenv('USERNAMEH'),
passwd= os.getenv('PASSWORDH'),
database="havenouh_v15"
)
# import a database of vital signs, labs and POCT for all surgical admission with at least one visit to theatre, between 2016 and 2017
query1 = """
SELECT vlp.*,
co.subject_id, co.gender, co.age, co.hospital_code, co.treatment_function_code, co.admission_time, co.discharge_time, co.cci, co.weight, co.BMI,
f.OP_start_time AS firstOP_start_time, f.OP_end_time AS firstOP_end_time
FROM user_baptiste_vasey.vitals_labs_POCTs vlp
LEFT JOIN user_baptiste_vasey.cohort co
ON vlp.hadm_id = co.hadm_id
LEFT JOIN user_baptiste_vasey.first_OP f
ON vlp.hadm_id = f.hadm_id
WHERE vlp.hadm_id IN (""" + population.string + """)
ORDER BY vlp.hadm_id, vlp.obstime;"""
df_OP_inpat = pd.read_sql(query1, mydb)
# import a database of theatre visits for the selected hospital admissions
query2 = """
SELECT hadm_id, start_time, end_time, theatre, main_specialty
FROM havenouh_v15.theatre_visits
WHERE hadm_id IN (""" + population.string + """)
ORDER BY hadm_id;"""
df_list_OP = pd.read_sql(query2, mydb)
# set OP end time with the value of OP start time in the cases where the value is missing
df_list_OP.loc[df_list_OP['end_time'].isnull(),'end_time'] = df_list_OP[df_list_OP['end_time'].isnull()]['start_time']
mydb.close()
# create a column time_since_hadm giving the time delta since admission
df_OP_inpat['time_since_hadm'] = df_OP_inpat['obstime'] - df_OP_inpat['admission_time']
# first operation end time is claculated in SQl server: see df_OP_inpat['firstOP_end_time']
# set first OP end time with the value of first OP start time in the cases where the value is missing
df_OP_inpat.loc[df_OP_inpat['firstOP_end_time'].isnull(),'firstOP_end_time'] = df_OP_inpat[df_OP_inpat['firstOP_end_time'].isnull()]['firstOP_start_time']
# get last operation end time
df_OP_inpat['lastOP_end_time'] = np.datetime64('NaT')
def get_lastOP_end_time(row):
hadm_id = row['hadm_id']
obs_time = row['obstime']
if obs_time >= row['firstOP_end_time']:
last_op = df_list_OP[(df_list_OP['hadm_id'] == hadm_id) & (df_list_OP['end_time'] < obs_time)]['end_time'].max()
row.loc['lastOP_end_time'] = last_op
# so observations taken before the first operation also have a value for 'lastOP_end_time'
else:
row.loc['lastOP_end_time'] = row['firstOP_end_time']
return row
df_OP_inpat=df_OP_inpat.apply(lambda x: get_lastOP_end_time(x), axis=1)
# create a column post_firstOP_time giving the time delta since the first operation
df_OP_inpat.loc[:,'post_firstOP_time'] = df_OP_inpat['obstime'] - df_OP_inpat['firstOP_end_time']
# transform the post first OP time into either deciles (to correct for markedly skewed data) or number of hours
df_OP_inpat['post_firstOP_time_deciles'] = pd.qcut(df_OP_inpat['post_firstOP_time'], 10, labels = False, duplicates = 'drop') +1
df_OP_inpat['post_firstOP_time_hours'] = df_OP_inpat['post_firstOP_time']/np.timedelta64(1, 'h')
# create a column post_lastOP_time giving the time delta since the last operation
df_OP_inpat.loc[:,'post_lastOP_time'] = df_OP_inpat['obstime'] - df_OP_inpat['lastOP_end_time']
# transform the post last OP time into either deciles (to correct for markedly skewed data) or number of hours
df_OP_inpat['post_lastOP_time_deciles'] = pd.qcut(df_OP_inpat['post_lastOP_time'], 10, labels = False, duplicates = 'drop') +1
df_OP_inpat['post_lastOP_time_hours'] = df_OP_inpat['post_lastOP_time']/np.timedelta64(1, 'h')
# create a column interval giving the time interval since the last recording of the same type (but not necessarily the same variable)
df_OP_inpat.loc[df_OP_inpat['charttime'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['charttime'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']
df_OP_inpat.loc[df_OP_inpat['labtime'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['labtime'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']
df_OP_inpat.loc[df_OP_inpat['collection_time'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['collection_time'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']
# convert sex F/M into 0/1
df_OP_inpat.loc[df_OP_inpat['gender'] == 'F', 'gender'] = 0
df_OP_inpat.loc[df_OP_inpat['gender'] == 'M', 'gender'] = 1
#order the colmns
df_OP_inpat=df_OP_inpat[['row_num', 'hadm_id', 'obstime', 'charttime', 'labtime', 'collection_time', 'location_id',
'HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU_RAW', 'GCS', 'AVPU',
'MASKTYPE_ID', 'FIO2_LMIN', 'FIO2_PERC', 'O2TH', 'ESTIMATED_FIO2', 'ESTIMATED_FIO2_2', 'BATEMAN_FIO2',
'ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'URUR', 'WBC',
'subject_id', 'gender', 'age', 'weight', 'BMI', 'cci', 'hospital_code', 'treatment_function_code',
'admission_time', 'discharge_time', 'firstOP_start_time', 'firstOP_end_time', 'lastOP_end_time',
'time_since_hadm', 'post_firstOP_time', 'post_firstOP_time_hours', 'post_lastOP_time', 'post_lastOP_time_hours', 'interval']]
#SQL data extraction patch
df_OP_inpat['BASEX'].replace('.....', np.nan, inplace=True)
for col in ['FIO2_PERC', 'ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'URUR', 'WBC']:
df_OP_inpat.loc[:,col]=df_OP_inpat[col].astype(float)
return df_OP_inpat
print('extraction function ready')
# 1 = pre first OP - on the ward
# 2 = post first OP - on the ward after recovery time
# 3 = in ICU (CICU: 17, CCU: 39, Adult ICU: 87, CTV ICU: 95, Neuro ICU: 1209
# 4 = in operative suite (Women's Day surgery and Diagnostic Suite: 35, Minor Ops Treatment room: 1228, Cardiac Angiography Suite: 58, Delivery Suite: 40 & 96, Cardiac Cath Lab E: 63,
# Cardiac Cath Lab A: 59, Cardiac Cath Lab B: 60, Cardiac Cath Lab D: 62, AVIC Cath Lab: 55, West Wing Main Theatre Suite Recovery: 148 & 25, Cardiac Angiography Suite Recovery: 336,
# 5 = in nursing home (Longlands: 1646, Green Pastures: 1644, Banbury Heights: 1641, Heathfield House: 1645, Triangle: 1648 & 1649, Albany: 1640, Freeland: 1666, The Close: 1657,
# Southerndown: 1667, The Ramping Cat: 1647, Acute Hospital at Home: 1710, Wyndham Hall: 1714 & 1717, St Lukes: 1656)
# 6 = on ward postoperative recovery (time window can be adapted through the recovery variable)
# 7 = errors
# Define recovery period
recovery = 1
def get_status(row, df_ICU, df_OP):
if row['location_id'] in [17, 39, 87, 95, 1209] or row['row_num'] in df_ICU['row_num'].values:
return 3
elif row['location_id'] in [25, 35, 40, 55, 58, 59, 60, 62, 63, 96, 148, 336, 1228] or row['row_num'] in df_OP['row_num'].values:
return 4
elif row['location_id'] in [1640, 1641, 1644, 1645, 1646, 1647, 1648, 1649, 1656, 1657, 1666, 1667, 1710, 1714, 1717]:
return 5
# adapt recovery period in hours
elif row['obstime'] > row['firstOP_end_time'] and row['obstime'] <= (row['firstOP_end_time'] + datetime.timedelta(hours=recovery)):
return 6
elif row['obstime'] >= (row['admission_time'] - datetime.timedelta(hours=24)) and row['obstime'] < row['firstOP_start_time']:
return 1
# post recovery period in hours (must match hours under number 6)
elif row['obstime'] > (row['firstOP_end_time'] + datetime.timedelta(hours=recovery)) and row['obstime'] <= (row['discharge_time'] + datetime.timedelta(hours=24)):
return 2
else:
return 7
print('status function ready')
# fetch lab and POCT value in the past within a <fetch_hours> hours time window; consider the last value of intraoperative, recovery and postop ward.
def fetch_labs(df_fetch, df_raw, fetch_hours):
def find_obs(x, interval, df):
time = x['obstime']
i = x['hadm_id']
df_sub=df[(df.hadm_id == i) & (df['obstime'] <= time) & ((time - df['obstime']) <= datetime.timedelta(hours=interval)) & (df['status'].isin([2, 4, 6]))]
def across_features(x, i, time):
last_index = x.last_valid_index()
if last_index != None:
last = df.loc[last_index,x.name]
else:
last = np.nan
return last
x[['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']] = df_sub[['ALB', 'ALBUR',
'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']].apply(across_features, args=(i,time))
return x
df_fetch=df_fetch.apply(find_obs, args=(fetch_hours,df_raw), axis=1)
return df_fetch
print('fetch past lab function ready')
# impute missing value for vital signs
def vitals_complete (df_complete):
for var in ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']:
df_complete[var]=df_complete.groupby('hadm_id')[var].apply(lambda x: x.fillna(x.interpolate()))
# to fix the abscence of interpolation if first value missing
df_complete[var]=df_complete.groupby('hadm_id')[var].apply(lambda x: x.fillna(x.bfill()))
# to impute any remaining missing values
df_complete[var]=df_complete[var].fillna(df_complete[var].median())
return df_complete
print('vital signs imputation function ready')
def get_NEWS(dfs):
def score_RR(value):
if 21<= value <25:
score = 2
elif value >=25:
score = 3
elif 9<= value <=11:
score = 1
elif value <9:
score = 3
else:
score = 0
return score
def score_SPO2(value):
if 94<= value <=95:
score = 1
elif 92<= value <=93:
score = 2
elif value <=91:
score = 3
else:
score = 0
return score
def score_FIO2_LMIN(value):
if value >0:
score = 2
else:
score = 0
return score
def score_SBP(value):
if value >=220:
score = 3
elif 101<= value <=110:
score = 1
elif 91<= value <=100:
score = 2
elif value <=90:
score = 3
else:
score = 0
return score
def score_HR(value):
if value >=131:
score = 3
elif 111<= value <=130:
score = 2
elif 91<= value <=110:
score = 1
elif 41<= value <=50:
score = 1
elif value <=40:
score = 3
else:
score = 0
return score
def score_AVPU(value):
if value >1:
score = 3
else:
score = 0
return score
def score_TEMP(value):
if value >=39.1:
score = 2
elif 38.1<= value <=39:
score = 1
elif 35.1<= value <=36:
score = 1
elif value <=35:
score = 3
else:
score = 0
return score
def alarm(df):
if df['score_total'] >= 5:
score = 1
elif df['score_RR'] == 3 or df['score_SPO2'] == 3 or df['score_SBP'] == 3 or df['score_HR'] == 3 or df['score_AVPU'] == 3 or df['score_TEMP'] == 3:
score = 1
else:
score = 0
return score
dfs['score_RR'] = dfs['RR'].apply(lambda x: score_RR(x))
dfs['score_SPO2'] = dfs['SPO2'].apply(lambda x: score_SPO2(x))
dfs['score_FIO2_LMIN'] = dfs['FIO2_LMIN'].apply(lambda x: score_FIO2_LMIN(x))
dfs['score_SBP'] = dfs['SBP'].apply(lambda x: score_SBP(x))
dfs['score_HR'] = dfs['HR'].apply(lambda x: score_HR(x))
dfs['score_AVPU'] = dfs['AVPU'].apply(lambda x: score_AVPU(x))
dfs['score_TEMP'] = dfs['TEMP'].apply(lambda x: score_TEMP(x))
dfs['score_total'] = dfs.apply(lambda row: row.score_RR + row.score_SPO2 + row.score_FIO2_LMIN + row.score_SBP + row.score_HR + row.score_AVPU + row.score_TEMP, axis = 1)
dfs['alarm'] = dfs.apply(lambda row: alarm(row), axis = 1)
return dfs
print('get NEWS function ready')
def track_missingness(df):
selected_lab_var = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU',
'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD',
'TBIL', 'TROP', 'UR', 'WBC']
for var in selected_lab_var:
var0 = var + '_miss'
# create new columns tracking initial missigness
df[var0]=np.nan
# 1 if present, 0 if missing
df[var0]=df.apply(lambda x: 0 if np.isnan(x[var]) else 1, axis = 1)
return df
print('missingness function ready')
def impute_labs(df_complete, df_raw):
# select imputation method (random_normal, pop_median, perso_median)
impute_method = 'perso_median'
# select input lab variables
selected_lab_var = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU',
'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD',
'TBIL', 'TROP', 'UR', 'WBC']
# only consider pre and post op ward values
df_raw = df_raw[df_raw['status'].isin([1, 2, 6])]
# from https://www.ouh.nhs.uk/biochemistry/tests/documents/biochemistry-reference-ranges.pdf
# and https://www.ouh.nhs.uk/services/departments/laboratory-medicine/haematology-laboratories/documents/haematology-handbook.pdf
# and https://www.nhs.uk/conditions/kidney-disease/diagnosis/
# and https://oxfordmedicine.com/view/10.1093/med/9780198703860.001.0001/med-9780198703860-appendix-1
# and https://www.ouh.nhs.uk/biochemistry/tests/tests-catalogue/methaemoglobin.aspx
# and https://www.nhs.uk/conditions/diabetic-ketoacidosis/
# normal distribution assuming a mean of lower range + (upper range-lower range)/2 and a SD of (mean-lower range)/2 => 2SD correpsonding to a 95% confidence interval, which is also the most common definition of the normal range
lab_references = {
'ALB' : [41.5, 4.75], #np.arange(32, 51, 1, dtype='float'), # g/L
'ALBUR' : [1.8, 0.9], #np.arange(0, 3.6, 0.1, dtype='float'), # mg/mmol (0 to 2.5 mg/mmol for male)
'ALP' : [80.5, 25.25], #np.arange(30, 131, 1, dtype='float'), # IU/l (age dependent)
'ALT' : [28, 9], #np.arange(10, 46, 1, dtype='float'), # IU/l
'APTT' : [25.5, 2.75], #np.arange(20, 31, 1, dtype='float'), # sec
'AST' : [29, 7], #np.arange(15, 43, 1, dtype='float'), # IU/L
'BAS' : [0.065, 0.0225], #np.arange(0.02, 0.11, 0.01, dtype='float'), # 10^9/l
'BASEX' : [0.5, 1.525], #np.arange(-3.0, 3.1, 0.1, dtype='float'), # mmol/L
'BICAR' : [27, 2.5], #np.arange(22, 32, 1, dtype='float'), # mmol/L
'CAL' : [1.205, 0.0525], #np.arange(1.1, 1.31, 0.01, dtype='float'), # mmol/l
'CL' : [100.5, 2.75], #np.arange(95, 106, 1, dtype='float'), # mmol/L
'CR' : [69.5, 10.25], #np.arange(49, 90, 1, dtype='float'), # µmol/L
'CRP' : [3, 1.5], #np.arange(0, 6, 1, dtype='float'), # mg/l
'COCA' : [2.405, 0.1025], #np.arange(2.2, 2.61, 0.01, dtype='float'), # mmol/l
'DDIM' : [250.5, 125.25], #np.arange(0, 501, 10, dtype='float'), # µg/l FEU
'EGFR' : [105.5, 7.75], #np.arange(90, 121, 1, dtype='float'), # ml/min (upper bound arbitrarily set for computational reasons)
'EOS' : [0.265, 0.1225], #np.arange(0.02, 0.51, 0.01, dtype='float'), # 10^9/l
'GLU' : [6.55, 1.275], #np.arange(4, 9.1, 0.1, dtype='float'), # caveat, this assume healthy patients or patients with controlled diabetes, with blood taken either before eating or 2h postprandial
'HCT' : [0.435, 0.0375], #np.arange(0.36, 0.51, 0.01, dtype='float'), # range male: 0.40-0.50, female:0.36-0.46
'HGB' : [14.6, 1.28], #np.arange(120, 171, 1, dtype='float'), # g/dl range male: 13.0-17.0, female: 12.0-15.0
'INR' : [1, 0.2], #np.arange(0.6, 1.4, 0.1, dtype='float'), # no unit (ratio)
'KET' : [0.3, 0.15], #np.arange(0, 0.6, 0.1, dtype='float'), # mmol/l
'LAC' : [1.3, 0.4], #np.arange(0.5, 2.1, 0.1, dtype='float'), # mmol/L
'LYM' : [2.55, 0.775], #np.arange(1, 4.1, 0.1, dtype='float'), # 10^9/l
'MCH' : [29.55, 1.275], #np.arange(27, 32.1, 0.1, dtype='float'), # pg
'MCHC' : [330.5, 7.75], #np.arange(315, 346, 1, dtype='float'), # g/l
'MCV' : [92, 4.5], #np.arange(83, 101, 1, dtype='float'), # fl
'METHB' : [0.8, 0.4], #np.arange(0, 1.6, 0.1, dtype='float'), # %
'MON' : [0.605, 0.2025], #np.arange(0.2, 1.01, 0.01, dtype='float'), # 10^9/l
'MPV' : [10.55, 0.775], #np.arange(9, 12.1, 0.1, dtype='float'), # fl
'NEU' : [4.55, 1.275], #np.arange(2, 7.1, 0.1, dtype='float'), # 10^9/l
'OSM' : [287, 4.5], #np.arange(278, 296, 1, dtype='float'), # mosmol/kg
'PH' : [7.405, 0.0175], #np.arange(7.37, 7.44, 0.01, dtype='float'), # no unit
'PLT' : [275.5, 62.75], #np.arange(150, 401, 1, dtype='float'), # 10^9/l
'POT' : [4.3, 0.4], #np.arange(3.5, 5.1, 0.1, dtype='float'), # mmol/L
'RBC' : [4.7, 0.45], #np.arange(3.8, 5.6, 0.1, dtype='float'), # 10^12/l (male range:4.5-5.5, female range:3.8-4.8)
'SOD' : [140.5, 2.75], #np.arange(135, 146, 1, dtype='float'), # mmol/l
'TBIL' : [11, 5.5], #np.arange(0, 22, 1, dtype='float'), # µmol/L
'TROP' : [17.5, 8.75], #np.arange(0, 35, 1, dtype='float'), # ng/L (male:0-34 ng/L, female: 0-17 ng/L)
'UR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific)
'URUR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific)
'WBC' : [7.55, 1.775], #np.arange(4, 11.1, 0.1, dtype='float') # 10^9/l
}
def personal_median(row, var, df_ref):
time = row['obstime']
hadm = row['hadm_id']
median = df_ref[(df_ref.hadm_id == hadm) & (df_ref['obstime'] <= time)][var].median()
return median
if impute_method == 'random_normal':
np.random.seed(2206)
for var in selected_lab_var:
# impute lab value based on a random draw from the physiological range
df_complete[var] = df_complete[var].apply(lambda x: round(np.random.normal(loc=lab_references[var][0], scale=lab_references[var][1]),2) if np.isnan(x) else x)
# set negative values to 0
df_complete[var] = df_complete[var].apply(lambda x: x if x>0 else 0)
elif impute_method == 'pop_median':
for var in selected_lab_var:
# impute lab value based on the population median
df_complete[var]=df_complete[var].fillna(df_complete[var].median())
elif impute_method == 'perso_median':
selected_lab_var.remove('BASEX') # allow distribution of BASEX in negative value
for var in selected_lab_var:
# impute lab value based on the admission non-missing anterior values median if any available including pre op
df_complete[var]=df_complete.apply(lambda x: personal_median(x, var, df_raw) if np.isnan(x[var]) else x[var], axis = 1)
# impute remaining missing values with random draw from the physiological range
df_complete[var] = df_complete[var].apply(lambda x: round(np.random.normal(loc=lab_references[var][0], scale=lab_references[var][1]),2) if np.isnan(x) else x)
# set negative values to 0
df_complete[var] = df_complete[var].apply(lambda x: x if x>0 else 0)
# tailored imputation for 'BASEX' (allow distribution of BASEX in negative value)
# impute lab value based on the admission non-missing anterior values median if any available
df_complete['BASEX']=df_complete.apply(lambda x: personal_median(x, 'BASEX', df_raw) if np.isnan(x['BASEX']) else x['BASEX'], axis = 1)
# impute remaining missing values with random draw from the physiological range
df_complete['BASEX'] = df_complete['BASEX'].apply(lambda x: round(np.random.normal(loc=lab_references['BASEX'][0], scale=lab_references['BASEX'][1]),2) if np.isnan(x) else x)
# set max EGFR value to 90
df_complete['EGFR'] = df_complete['EGFR'].apply(lambda x: x if x<=90 else 90)
else:
print("please define correct imputation method")
return df_complete
print('labs and POCT imputation function ready')
def min_max(df, df_support):
selected_var = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']
df_support = df_support[df_support['status'].isin([2, 6])]
def amplitude_24(row, var, df_support):
time = row['obstime']
hadm = row['hadm_id']
# fetch 24h window in df_raw, only post recovery ward data
df_prov = df_support[(df_support['hadm_id'] == hadm) & (df_support['obstime'] > (time - datetime.timedelta(hours=24))) & (df_support['obstime'] <= time) & (df_support['status'] == 2)]
# to diferentiate true 0 difference from time window with only 1 datapoint
if np.sum(df_prov[var].count())<=1:
minmax = np.nan
else:
minmax = df_prov[var].max() - df_prov[var].min()
return minmax
def impute_amplitude_24(row, var, df_support):
time = row['obstime']
hadm = row['hadm_id']
# get hadm_id median in df_raw, only ward data
df_prov = df_support[(df_support['hadm_id'] == hadm) & (df_support['obstime'] < time)]
# if median is not nan, set minmax as absolute value of difference between value and median of previous observation
if np.isnan(df_prov[var].median()) == False:
minmax = abs(row[var] - df_prov[var].median())
else:
minmax = 0
return minmax
for var in selected_var:
var0 = var + '_minmax'
# create new columns looking at maximal amplitude in the last 24 hours
df[var0]=np.nan
# min-max amplitude in the last 24h
df[var0]=df.apply(lambda x: amplitude_24(x, var, df_support), axis = 1)
# impute difference from personal median if minmax = nan
df[var0]=df.apply(lambda x: impute_amplitude_24(x, var, df_support) if np.isnan(x[var0]) else x[var0], axis = 1)
return df
print('min-max function ready')
def delta_vitals(df, df_support):
selected_var_vitals = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']
df_support_vital1 = df_support[(df_support['charttime'].notnull()) & (df_support['status']==2)]
df_support_vital2 = df_support[(df_support['charttime'].notnull()) & (df_support['status'].isin([1, 2]))]
# delta column for vital signs values
for var in selected_var_vitals:
var0 = var + '_delta'
# create new columns looking at changes in 12 hours median value since 6 hours ago.
df[var0]=np.nan
def delta_24(row, var, df_support_vital1, df_support_vital2):
time = row['obstime']
hadm = row['hadm_id']
# fetch difference between 12 hours median and 12h median 6 hours before (only ward post recovery)
df_prov1 = df_support_vital1[(df_support_vital1['hadm_id'] == hadm) & (df_support_vital1['obstime'] > (time - datetime.timedelta(hours=12))) & (df_support_vital1['obstime'] <= time)]
df_prov1_delta6 = df_support_vital1[(df_support_vital1['hadm_id'] == hadm) & (df_support_vital1['obstime'] > (time - datetime.timedelta(hours=18))) & (df_support_vital1['obstime'] <= (time - datetime.timedelta(hours=6)))]
df_median = df_support_vital2[(df_support_vital2['hadm_id'] == hadm) & (df_support_vital2['obstime'] < time)]
for var in selected_var_vitals:
column = var + '_delta'
# fetch hadm_id past median value including pre-op time
median0 = df_prov1[var].median()
median6 = df_prov1_delta6[var].median()
median_hadm = df_median[var].median()
if (np.isnan(median0) == False) & (np.isnan(median6) == False):
row[column] = round((median0 - median6), 2)
elif (np.isnan(median_hadm) == False) & (np.isnan(median0) == False):
# if row is the first observation within 18 hours, compare it to the hospital admission's anterior median (including preOP values)
row[column] = round(median0 - median_hadm, 2)
else:
# if none available, consider delta as 0
row[column] = 0
return row
df=df.apply(lambda x: delta_24(x, selected_var_vitals, df_support_vital1, df_support_vital2), axis = 1)
return df
print('delta vitals function ready')
def delta_labs(df, df_support):
# normal distribution assuming a mean of lower range + (upper range-lower range)/2 and a SD of (mean-lower range)/2 => 2SD correpsonding to a 95% confidence interval, which is also the most common definition of the normal range
lab_references = {
'ALB' : [41.5, 4.75], #np.arange(32, 51, 1, dtype='float'), # g/L
'ALBUR' : [1.8, 0.9], #np.arange(0, 3.6, 0.1, dtype='float'), # mg/mmol (0 to 2.5 mg/mmol for male)
'ALP' : [80.5, 25.25], #np.arange(30, 131, 1, dtype='float'), # IU/l (age dependent)
'ALT' : [28, 9], #np.arange(10, 46, 1, dtype='float'), # IU/l
'APTT' : [25.5, 2.75], #np.arange(20, 31, 1, dtype='float'), # sec
'AST' : [29, 7], #np.arange(15, 43, 1, dtype='float'), # IU/L
'BAS' : [0.065, 0.0225], #np.arange(0.02, 0.11, 0.01, dtype='float'), # 10^9/l
'BASEX' : [0.5, 1.525], #np.arange(-3.0, 3.1, 0.1, dtype='float'), # mmol/L
'BICAR' : [27, 2.5], #np.arange(22, 32, 1, dtype='float'), # mmol/L
'CAL' : [1.205, 0.0525], #np.arange(1.1, 1.31, 0.01, dtype='float'), # mmol/l
'CL' : [100.5, 2.75], #np.arange(95, 106, 1, dtype='float'), # mmol/L
'CR' : [69.5, 10.25], #np.arange(49, 90, 1, dtype='float'), # µmol/L
'CRP' : [3, 1.5], #np.arange(0, 6, 1, dtype='float'), # mg/l
'COCA' : [2.405, 0.1025], #np.arange(2.2, 2.61, 0.01, dtype='float'), # mmol/l
'DDIM' : [250.5, 125.25], #np.arange(0, 501, 10, dtype='float'), # µg/l FEU
'EGFR' : [105.5, 7.75], #np.arange(90, 121, 1, dtype='float'), # ml/min (upper bound arbitrarily set for computational reasons)
'EOS' : [0.265, 0.1225], #np.arange(0.02, 0.51, 0.01, dtype='float'), # 10^9/l
'GLU' : [6.55, 1.275], #np.arange(4, 9.1, 0.1, dtype='float'), # caveat, this assume healthy patients or patients with controlled diabetes, with blood taken either before eating or 2h postprandial
'HCT' : [0.435, 0.0375], #np.arange(0.36, 0.51, 0.01, dtype='float'), # range male: 0.40-0.50, female:0.36-0.46
'HGB' : [14.6, 1.28], #np.arange(120, 171, 1, dtype='float'), # g/dl range male: 13.0-17.0, female: 12.0-15.0
'INR' : [1, 0.2], #np.arange(0.6, 1.4, 0.1, dtype='float'), # no unit (ratio)
'KET' : [0.3, 0.15], #np.arange(0, 0.6, 0.1, dtype='float'), # mmol/l
'LAC' : [1.3, 0.4], #np.arange(0.5, 2.1, 0.1, dtype='float'), # mmol/L
'LYM' : [2.55, 0.775], #np.arange(1, 4.1, 0.1, dtype='float'), # 10^9/l
'MCH' : [29.55, 1.275], #np.arange(27, 32.1, 0.1, dtype='float'), # pg
'MCHC' : [330.5, 7.75], #np.arange(315, 346, 1, dtype='float'), # g/l
'MCV' : [92, 4.5], #np.arange(83, 101, 1, dtype='float'), # fl
'METHB' : [0.8, 0.4], #np.arange(0, 1.6, 0.1, dtype='float'), # %
'MON' : [0.605, 0.2025], #np.arange(0.2, 1.01, 0.01, dtype='float'), # 10^9/l
'MPV' : [10.55, 0.775], #np.arange(9, 12.1, 0.1, dtype='float'), # fl
'NEU' : [4.55, 1.275], #np.arange(2, 7.1, 0.1, dtype='float'), # 10^9/l
'OSM' : [287, 4.5], #np.arange(278, 296, 1, dtype='float'), # mosmol/kg
'PH' : [7.405, 0.0175], #np.arange(7.37, 7.44, 0.01, dtype='float'), # no unit
'PLT' : [275.5, 62.75], #np.arange(150, 401, 1, dtype='float'), # 10^9/l
'POT' : [4.3, 0.4], #np.arange(3.5, 5.1, 0.1, dtype='float'), # mmol/L
'RBC' : [4.7, 0.45], #np.arange(3.8, 5.6, 0.1, dtype='float'), # 10^12/l (male range:4.5-5.5, female range:3.8-4.8)
'SOD' : [140.5, 2.75], #np.arange(135, 146, 1, dtype='float'), # mmol/l
'TBIL' : [11, 5.5], #np.arange(0, 22, 1, dtype='float'), # µmol/L
'TROP' : [17.5, 8.75], #np.arange(0, 35, 1, dtype='float'), # ng/L (male:0-34 ng/L, female: 0-17 ng/L)
'UR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific)
'WBC' : [7.55, 1.775], #np.arange(4, 11.1, 0.1, dtype='float') # 10^9/l
}
selected_var_labs = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU',
'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD',
'TBIL', 'TROP', 'UR', 'WBC']
df_support['labPOCT_time'] = df_support['labtime'].combine_first(df_support['collection_time'])
df_support_lab1 = df_support[(df_support['labPOCT_time'].notnull()) & (df_support['status'].isin([2, 4, 6]))]
df_support_lab2 = df_support[(df_support['labPOCT_time'].notnull()) & (df_support['status'].isin([1, 2]))]
# delta column for labs and POCT values
for var in selected_var_labs:
var0 = var + '_delta'
# create new columns looking at changes in 12 hours median value since 6 hours ago.
df[var0]=np.nan
def delta_50(row, var, df_support_lab1, df_support_lab2, lab_references):
time = row['obstime']
hadm = row['hadm_id']
# fetch difference between value and last recorded value within 50h
df_prov1 = df_support_lab1[(df_support_lab1['hadm_id'] == hadm) & (df_support_lab1['obstime'] > (time - datetime.timedelta(hours=50))) & (df_support_lab1['obstime'] <= time)]
df_prov2 = df_support_lab2[(df_support_lab2['hadm_id'] == hadm) & (df_support_lab2['obstime'] < time)]
for var in selected_var_labs:
column = var + '_delta'
# fetch hadm_id past median value including pre-op time
median = df_prov2[var].median()
if (np.sum(df_prov1[var].count())>=2):
last_lab_time1 = time - df_prov1[df_prov1[var].notnull()].iloc[-1]['obstime']
last_lab_time2 = time - df_prov1[df_prov1[var].notnull()].iloc[-2]['obstime']
# makes sure the last value in the table is less than 26 hours old (otherwise the value would be an imputed value)
# and that there are more than 6 hours between 2 measurements (otherwise small variation in measurment accuracy would have disproportionate influence on the value standardised over 24)
if (last_lab_time1/np.timedelta64(1, 'm') <= 1560) & ((last_lab_time1 - last_lab_time2) >= datetime.timedelta(hours=6)):
# difference between current lab value (may have been fetched forward) and the last one recorded within a 50h period
# add a coefficient to scale the difference to a 24 hour time window
delta_time = df_prov1[df_prov1[var].notnull()].iloc[-1]['obstime'] - df_prov1[df_prov1[var].notnull()].iloc[-2]['obstime']
row[column] = round((df_prov1[df_prov1[var].notnull()].iloc[-1][var] - df_prov1[df_prov1[var].notnull()].iloc[-2][var])*(1440/(delta_time/np.timedelta64(1, 'm'))), 2)
# all the following lines assume the change from baseline happened in 24h
elif np.isnan(median) == False:
# difference from admission median
row[column] = round((row[var] - median), 2)
else:
# difference from the mean value of the normal range
baseline = lab_references[var][0]
row[column] = round((row[var] - baseline), 2)
elif np.isnan(median) == False:
# difference from admission median
row[column] = round((row[var] - median), 2)
else:
# difference from the mean value of the normal range
baseline = lab_references[var][0]
row[column] = round((row[var] - baseline), 2)
return row
df=df.apply(lambda x: delta_50(x, selected_var_labs, df_support_lab1, df_support_lab2, lab_references), axis = 1)
return df
print('delta labs function ready')
def get_outcomes(dfo, time_icu, time_arrest, time_death, time_reOP, first_day_procedure, max_procedure_day, max_prescription_time):
mydb = mysql.connector.connect(
host="mysql8",
user= os.getenv('USERNAMEH'),
passwd= os.getenv('PASSWORDH'),
database="havenouh_v15"
)
# list of admissions with unplanned ICU admission with ICU admission times
query10 = """
SELECT DISTINCT ad.hadm_id, icu.admission_time
FROM havenouh_v15.admissions AS ad
LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
LEFT JOIN havenouh_v15.icustays AS icu ON (icu.subject_id = ad.subject_id)
LEFT JOIN havenouh_v15.chartevents AS ch ON (icu.subject_id = ch.subject_id)
WHERE ad.hadm_id IN (""" + pop.string + """)
AND (icu.admission_time BETWEEN ad.admission_time AND ad.discharge_time)
AND TIMESTAMPDIFF(minute, tv.end_time, icu.admission_time) >0
AND (ch.charttime BETWEEN tv.end_time AND icu.admission_time)
AND (icu.admission_type IN ('Unplanned local admission', 'Unplanned transfer in') OR icu.admission_type IS NULL)
ORDER BY ad.hadm_id, icu.admission_time;
"""
df_icu_admissions= pd.read_sql(query10, mydb)
# list of admissions with cardiac arrests after stay on the ward with arrest time
query11 = """
SELECT DISTINCT ad.hadm_id, ar.arrest_time
FROM havenouh_v15.admissions AS ad
LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
LEFT JOIN havenouh_v15.arrestcalls AS ar ON (ar.subject_id = ad.subject_id)
LEFT JOIN havenouh_v15.chartevents AS ch ON (ch.subject_id = ad.subject_id)
WHERE ad.hadm_id IN (""" + pop.string + """)
AND ar.arrest_time BETWEEN ad.admission_time AND ad.discharge_time
AND TIMESTAMPDIFF(minute, tv.end_time, ar.arrest_time) >0
AND (ch.charttime BETWEEN tv.end_time AND ar.arrest_time)
ORDER BY ad.hadm_id, ar.arrest_time;
"""
df_cardiac_arrest = pd.read_sql(query11, mydb)
# list of in hospital mortality after stay on the ward with time of death
query12 = """
SELECT DISTINCT ad.hadm_id, ad.discharge_time
FROM havenouh_v15.admissions AS ad
LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
LEFT JOIN havenouh_v15.chartevents AS ch ON (ch.subject_id = ad.subject_id)
WHERE ad.hadm_id IN (""" + pop.string + """)
AND ch.charttime BETWEEN tv.end_time AND ad.discharge_time
AND ad.discharge_method = 4
ORDER BY ad.hadm_id, ad.discharge_time;
"""
df_mortality = pd.read_sql(query12, mydb)
# list of operations with time of operation
query13 = """
SELECT DISTINCT ad.hadm_id, tv.start_time
FROM havenouh_v15.admissions AS ad
LEFT JOIN havenouh_v15.theatre_visits AS tv ON (tv.hadm_id = ad.hadm_id)
WHERE ad.hadm_id IN (""" + pop.string + """)
ORDER BY ad.hadm_id;
"""
df_OP = pd.read_sql(query13, mydb)
# load OPCS codes dataframe
infile = open('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/OPCS_codes','rb')
df_opcs = pickle.load(infile)
infile.close()
# list of procedures (on the operation day or after) with time of procedures
query14 = """
SELECT fi.hadm_id, pr.procedure_date, pr.procedure_code
FROM (
SELECT min(end_time) AS OP_end_time, hadm_id
FROM havenouh_v15.theatre_visits
GROUP BY hadm_id) AS fi
LEFT JOIN havenouh_v15.procedures AS pr ON (fi.hadm_id = pr.hadm_id)
WHERE fi.hadm_id IN (""" + pop.string + """)
AND day(pr.procedure_date) >= day(OP_end_time)
ORDER BY fi.hadm_id;
"""
df_procedures_ = pd.read_sql(query14, mydb)
df_procedures = pd.merge(df_procedures_, df_opcs, how='left', on='procedure_code')
# list of prescriptions with time of presciptions
query15 = """
SELECT prescription_id, hadm_id, order_mnemonic as drug_name, drug_form, drug_route, dose, frequency, subject_own_meds, order_placed_time as prescription_time
FROM user_baptiste_vasey.list_prescriptions
WHERE hadm_id IN (""" + pop.string + """)
ORDER BY hadm_id, order_placed_time;
"""
df_prescriptions = pd.read_sql(query15, mydb)
mydb.close()
# calculate if an unplanned ICU admission occured within <time_icu> hours of a contact point
def get_icu(df_icu):
hadm_icu = df_icu['hadm_id']
session_time = df_icu['charttime']
df_icu_temp = df_icu_admissions[df_icu_admissions['hadm_id'] == hadm_icu]
if df_icu_temp.empty == True:
score = 0
else:
if df_icu_temp['admission_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_icu) else False).any() == True:
score = 1
else:
score = 0
return score
# calculate if cardiac arrest occured within <time_arrest> hours of a contact point
def get_arrest(df_arrest):
hadm_arrest = df_arrest['hadm_id']
session_time = df_arrest['charttime']
df_arrest_temp = df_cardiac_arrest[df_cardiac_arrest['hadm_id'] == hadm_arrest]
if df_arrest_temp.empty == True:
score = 0
else:
if df_arrest_temp['arrest_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_arrest) else False).any() == True:
score = 1
else:
score = 0
return score
# calculate if a patient died within <time_death> hours of a contact point
def get_death(df_death):
hadm_death = df_death['hadm_id']
session_time = df_death['charttime']
df_death_temp = df_mortality[df_mortality['hadm_id'] == hadm_death]
if df_death_temp.empty == True:
score = 0
else:
if df_death_temp['discharge_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_death) else False).any() == True:
score = 1
else:
score = 0
return score
# calculate if a readmission to theatre occured within <time_reOP> hours of a contact point
def get_reOP(df_reOP):
hadm_reOP = df_reOP['hadm_id']
session_time = df_reOP['charttime']
df_reOP_temp = df_OP[df_OP['hadm_id'] == hadm_reOP]
if df_reOP_temp.empty == True:
score = 0
else:
if df_reOP_temp['start_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_reOP) else False).any() == True:
score = 1
else:
score = 0
return score
# fetch all procedures booked on the <max_procedure_day> days following a contact point (the procedure metadata does not allow for a better time discrimination)
def get_procedures(df_proc, df_procedures, first_day_procedure, max_procedure_day):
hadm_proc = df_proc['hadm_id']
session_day = df_proc['charttime'].date()
min_day = datetime.datetime.combine((session_day + datetime.timedelta(days=first_day_procedure)), datetime.datetime.min.time())
max_day = datetime.datetime.combine((session_day + datetime.timedelta(days=max_procedure_day)), datetime.datetime.min.time())
df_proc_temp = df_procedures[(df_procedures['hadm_id'] == hadm_proc) & (df_procedures['procedure_date'] >= min_day) & (df_procedures['procedure_date'] <= max_day)].reset_index(drop=True)
if df_proc_temp.empty == True:
procedures = ['no procedure']
else:
procedures = df_proc_temp['procedure_name'].to_list()
return procedures
# fetch all drugs prescribed during the <max_prescription_time> hours following a contact point
def get_drugs(df_drugs):
#subject_own_meds = ['POD Ward - OUH supplied', 'POD Home', 'PODP - brought in', 'POD Ward - brought in', 'PODP - OUH supplied', 'POD Fridge - brought in',
# 'PODCD - OUH supplied', 'Relabel - OUH supplied', 'Relabel - brought in', 'Community Hospital Stock', 'PODCD - brought in', 'POD Fridge - OUH supplied']
hadm_drugs = df_drugs['hadm_id']
session_time = df_drugs['charttime']
df_drugs_temp = df_prescriptions[(df_prescriptions['hadm_id'] == hadm_drugs) & df_prescriptions['subject_own_meds'].isnull()].drop_duplicates(subset = 'drug_name', keep = 'first')
df_new_drugs = df_drugs_temp[(df_drugs_temp['prescription_time'] >= session_time) & (df_drugs_temp['prescription_time'] <= (session_time + datetime. timedelta(hours=max_prescription_time)))].reset_index(drop = True)
if df_new_drugs.empty == True:
new_drugs = ['no new drugs']
else:
new_drugs = []
for i in df_new_drugs.index:
new_drugs.append(df_new_drugs.loc[i,'drug_name'])
return new_drugs
# calculate the total number of observations sets for the same hospital admission
def get_number(df_number):
hadm_number = df_number['hadm_id']
number = len(dfo[dfo['hadm_id'] == hadm_number].index)
return number
# create a new column where value = 1 if ICU admission in the next "time_icu" hours
dfo['icu_event'] = dfo.apply(lambda row: get_icu(row), axis = 1)
print ('icu done')
# create a new column where value = 1 if cardiac arrest in the next "time_arrest" hours
dfo['arrest_event'] = dfo.apply(lambda row: get_arrest(row), axis = 1)
print ('arrest done')
# create a new column where value = 1 if patient death in the next "time_death" hours
dfo['mortality_event'] = dfo.apply(lambda row: get_death(row), axis = 1)
print ('death done')
# create a new column where value = 1 if re-operation in the next "time_reOP" hours
dfo['reOP_event'] = dfo.apply(lambda row: get_reOP(row), axis = 1)
print ('reOP done')
# create a new column where value = list of procedures (OPCS codes) in the next "time_procedures" days
dfo['procedures'] = dfo.apply(lambda row: get_procedures(row, df_procedures, first_day_procedure, max_procedure_day), axis = 1)
print ('procedures done')
# create a new column where value = list of procedures (OPCS codes) in the next "time_procedures" days
dfo['new_prescriptions'] = dfo.apply(lambda row: get_drugs(row), axis = 1)
print ('prescriptions done')
# create a new column where value = total number of observations sets for the same hospital admission
dfo['number_of_sessions'] = dfo.apply(lambda row: get_number(row), axis = 1)
dfo.loc[(dfo.icu_event == 1) | (dfo.arrest_event == 1) | (dfo.mortality_event == 1) | (dfo.reOP_event == 1), 'event'] = 1
dfo.loc[(dfo.icu_event != 1) & (dfo.arrest_event != 1) & (dfo.mortality_event != 1) & (dfo.reOP_event != 1), 'event'] = 0
return dfo
print('get outcome function ready')
def refine_prescriptions (df_pres, df_trans_drug):
df_pres["new_fluid"] = np.NaN
df_pres["new_prescriptions_grouped"] = ''
df_pres.reset_index(inplace=True)
# select and group prescriptions according to the document list_prescriptions_final.cvs
for i in df_pres.index:
pres_old = pd.DataFrame(df_pres['new_prescriptions'].iloc[i], columns=['new_prescriptions'])
pres_new = pd.merge(pres_old, df_trans_drugs, left_on = 'new_prescriptions', right_on = 'prescription')
# volume is considered as prescription if a patient receives more than 1000ml in 24h
new_fluid = pres_new.quantity_ml.sum()
df_pres.loc[i, 'new_fluid'] = new_fluid
if new_fluid >= 1000:
new_row = {'new_prescriptions':'volume_tot', 'prescription':'volume_tot', 'category':'volume_tot', 'group':'volume_tot', 'included':1, 'fluid':'fluid', 'quantity_ml':new_fluid}
pres_new = pres_new.append(new_row, ignore_index=True)
prescriptions = pres_new[pres_new.included == 1]['group'].drop_duplicates().tolist()
if len(prescriptions) == 0:
prescriptions = ['no_new_drugs']
df_pres.at[i, 'new_prescriptions_grouped'] = prescriptions
# Add a column with the number of new prescriptions (no filtering)
df_pres['number_prescriptions'] = np.nan
for i in df_pres.index:
if df_pres['new_prescriptions'].iloc[i][0] == 'no new drugs':
df_pres.loc[i, 'number_prescriptions'] = 0
else:
df_pres.loc[i, 'number_prescriptions'] = len(df_pres['new_prescriptions'].iloc[i])
# Add a column with the number of new prescriptions groups (selected groups of interest)
df_pres['number_prescriptions_groups'] = np.nan
for i in df_pres.index:
if df_pres['new_prescriptions_grouped'].iloc[i][0] == 'no_new_drugs':
df_pres.loc[i, 'number_prescriptions_groups'] = 0
else:
df_pres.loc[i, 'number_prescriptions_groups'] = len(df_pres['new_prescriptions_grouped'].iloc[i])
return df_pres
def refine_procedures (df_proc, df_trans_proc):
df_proc['new_procedures_grouped'] = ''
df_proc.reset_index(inplace=True)
# select and group procedures according to the document list_procedures_final.cvs
for i in df_proc.index:
proc_old = pd.DataFrame(df_proc['procedures'].iloc[i], columns=['procedures'])
proc_new = pd.merge(proc_old, df_trans_proc, left_on = 'procedures', right_on = 'procedures')
procedures = proc_new[proc_new.included == 1]['group'].drop_duplicates().tolist()
if len(procedures) == 0:
procedures = ['no procedure']
df_proc.at[i, 'new_procedures_grouped'] = procedures
# Add a column with the number of new procedure groups (selected groups of interest)
df_proc['number_procedures_groups'] = np.nan
for i in df_proc.index:
if df_proc['new_procedures_grouped'].iloc[i][0] == 'no procedure':
df_proc.loc[i, 'number_procedures_groups'] = 0
else:
df_proc.loc[i, 'number_procedures_groups'] = len(df_proc['new_procedures_grouped'].iloc[i])
return df_proc
print('refine outcome functions ready')
#select populations of interest
pop = pop4
#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26
#---------------------------------------------------------------------------------------------------------------
start = datetime.datetime.now()
#extract the data
df_raw = extraction(pop)
df_raw = df_raw.drop(columns='URUR') # remove one of the variable found later in the analysis to have been mistakenly included
print('extraction completed')
print('number of data colection timepoints (rows): ' + str(len(df_raw.index)))
print('number of hospital admissions: ' + str(len(df_raw['hadm_id'].unique())))
#----------------------------------------------------------------------------------------------------------------
# generate the necessary support databases for the get_status function
# connect to database
mydb = mysql.connector.connect(
host="mysql8",
user= os.getenv('USERNAMEH'),
passwd= os.getenv('PASSWORDH'),
database="havenouh_v15"
)
# get list of all observation taken while in ICU
query3 = """
SELECT vlp.*,
co.subject_id, co.gender, co.age, co.hospital_code, co.treatment_function_code, co.admission_time, co.discharge_time, co.cci, co.weight, co.BMI,
icu.admission_time as ICU_admission_time, icu.discharge_time as ICU_discharge_time
FROM user_baptiste_vasey.vitals_labs_POCTs vlp
LEFT JOIN user_baptiste_vasey.cohort co
ON vlp.hadm_id = co.hadm_id
LEFT JOIN (
SELECT subject_id, admission_time, discharge_time
FROM havenouh_v15.icustays
ORDER BY subject_id
) icu
ON co.subject_id = icu.subject_id
WHERE vlp.hadm_id IN (""" + pop.string + """)
AND vlp.obstime >= icu.admission_time
AND vlp.obstime <= icu.discharge_time
ORDER BY vlp.hadm_id, vlp.obstime;"""
df_list_ICUobs = pd.read_sql(query3, mydb)
# get list of all observation taken while in theatre or <1 hour postop
query4 = """
SELECT vlp.*,
co.subject_id,
th.start_time as OP_start_time, th.end_time as OP_end_time
FROM user_baptiste_vasey.vitals_labs_POCTs vlp
LEFT JOIN user_baptiste_vasey.cohort co
ON vlp.hadm_id = co.hadm_id
LEFT JOIN (
SELECT hadm_id, start_time, end_time
FROM havenouh_v15.theatre_visits
) th
ON vlp.hadm_id = th.hadm_id
WHERE vlp.hadm_id IN (""" + pop.string + """)
AND vlp.obstime >= th.start_time
AND vlp.obstime <= DATE_ADD(th.end_time, INTERVAL 1 HOUR)
ORDER BY vlp.hadm_id, vlp.obstime;"""
df_list_OPobs = pd.read_sql(query4, mydb)
mydb.close()
# attribute provenance status
df_raw['status'] = df_raw.apply(lambda x: get_status(x, df_list_ICUobs, df_list_OPobs), axis = 1)
print('status completed')
#-------------------------------------------------------------------------------------------------
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#select populations of interest
pop = pop4
#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26
start = datetime.datetime.now()
print(pop.name)
#---------------------------------------------------------------------------------------------------------------
# create a column counting the number of existing observation in a set (discard sets with only one type of obs, such as temperature)
df_raw['n_obs_set'] = np.nan
df_raw['n_obs_set'] = df_raw[['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN']].apply(lambda x: x.count(), axis=1)
# create a dataframe with all observation set taken on the ward, with lab and POCT values fetch in the predefined window => used to quantify missing data
df_start=df_raw[(df_raw['n_obs_set'] >= 6) & (df_raw['status'].isin([2, 6]))].copy(deep=True)
df_start.drop_duplicates(subset=['obstime'], inplace=True)
df_start = fetch_labs(df_start, df_raw, selected_fetch_window)
print('initial database complete')
print('number of contact points (rows): ' + str(len(df_start.index)))
print('number of hospital admissions: ' + str(len(df_start['hadm_id'].unique())))
#----------------------------------------------------------------------------
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#select populations of interest
pop = pop4
#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26
#---------------------------------------------------------------------------------------------------------------
start = datetime.datetime.now()
print(pop.name)
# impute missing vital signs and get NEWS score
df_imputed = vitals_complete(df_start)
df_imputed = get_NEWS(df_imputed)
print('vitals imputation and NEWS calculation complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
# add columns recording missingness
df_imputed = track_missingness(df_imputed)
print('track missingness complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
# impute missing labs and POCT values
df_imputed = impute_labs(df_imputed, df_raw)
print('labs imputation complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
# create a minmax column for all vitals (min-max amplitude in the last 26h)
df_imputed = min_max(df_imputed, df_raw)
print('minmax complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
# create a delta column for all vitals (delta since last measurmement)
df_imputed = delta_vitals(df_imputed,df_raw)
print('delta vitals complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
# create a delta column for all labs + POCT (delta since last measurmement)
df_imputed = delta_labs(df_imputed,df_raw)
print('delta labs complete')
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
#----------------------------------------------------------------------------
print('imputed database complete')
print('number of contact points (rows): ' + str(len(df_imputed.index)))
print('number of hospital admissions: ' + str(len(df_imputed['hadm_id'].unique())))
#select populations of interest
pop = pop4
# define time window of interest in the future (times are in hours, except time_procedure which is in days, 0 being same day procedures)
icu = 24 # time in hours before ICU admission
arrest = 24 # time in hours before cardiac arrest
death = 24 # time in hours before death
reOP = 24 # time in hours before re-operation
first_day_procedure = 1 # 0 if want to consider same day procedure, 1 otherwise
max_procedure_day = 1 # time in days into the future to consider procedures, 0 being same day procedure, must be >= first_day_procedure
max_prescription_time = 24 # time in hours into the future to consider new prescription
start = datetime.datetime.now()
print(pop.name)
df_complete = get_outcomes(df_imputed, time_icu = icu, time_arrest = arrest, time_death = death, time_reOP = reOP, first_day_procedure = first_day_procedure,
max_procedure_day = max_procedure_day, max_prescription_time = max_prescription_time)
print('database with outcome complete')
print('number of contact points (rows): ' + str(len(df_imputed.index)))
print('number of hospital admissions: ' + str(len(df_imputed['hadm_id'].unique())))
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
# load the prescription database with drug categories grouping
df_trans_drugs = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/list_prescriptions_final.csv')
# load the procedure database with procedures grouping
df_trans_proc = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/list_procedures_final.csv')
start = datetime.datetime.now()
print(pop.name)
# pre-process and refine the prescriptions
df_clustering_ready = refine_prescriptions(df_complete, df_trans_drugs)
print('prescriptions done')
# pre-process and refine the procedures
df_clustering_ready = refine_procedures(df_clustering_ready, df_trans_proc)
print('procedures done')
print('database with outcome complete')
print('number of contact points (rows): ' + str(len(df_clustering_ready.index)))
print('number of hospital admissions: ' + str(len(df_clustering_ready['hadm_id'].unique())))
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))
df_analysis = df_clustering_ready.copy()
#select sampling of interest
sampling_meth = 'first' # define sampling method:
# all = all postoperative observations on the ward
# allalarm = all observation triggering an alarm postoperatively on the ward,
# first = first observation triggering an alarm postoperatively on the ward,
# oneofeach = first observation triggering an alarm postoperatively on the ward + one random observation set for all the other admissions
# in addition for the samplin method "firts" and "oneofeach", only observation set within 30 ays of the last operation are considered.
if sampling_meth == 'all':
df_analysis = df_analysis[(df_analysis['status'] == 2)].copy(deep=True)
elif sampling_meth == 'allalarm':
df_analysis = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1)].copy(deep=True)
elif sampling_meth == 'first':
df_analysis = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
elif sampling_meth == 'oneofeach':
# select the first observation set triggerng an alarm for each hospital admission triggering at least one alarm
df_analysis_1 = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
hadm_alarm = list(df_analysis_1['hadm_id'].unique())
# select one random observation set in each of the remaining admissions
df_analysis_2 = df_analysis[(~df_analysis['hadm_id'].isin(hadm_alarm)) & (df_analysis['status'] == 2) & (df_analysis['alarm'] == 0) & (df_analysis['post_lastOP_time_hours'] < 720)].sample(frac=1, random_state = 2206).drop_duplicates(subset=['hadm_id'], keep='first')
df_analysis = pd.concat([df_analysis_1, df_analysis_2])
else:
print("error sampling method")
# hold out test set (df_test)
X_gss = df_analysis
y_gss = df_analysis.hospital_code # only to allow function to run
group = df_analysis.subject_id # prevent data leakage from same patient with several hospital admission
gss = GroupShuffleSplit(n_splits=2, train_size=.8, random_state=2206)
gss.get_n_splits()
train_ix, test_ix = next(gss.split(X_gss, y_gss, groups=group))
df_train = X_gss.iloc[train_ix].copy()
df_test = X_gss.iloc[test_ix].copy()
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
print('length index (contact points) after sampling: ' + str(len(df_analysis.index)))
print('from ' + str(len(df_analysis['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_analysis['subject_id'].unique())) + 'patients')
print('length index train set: ' + str(len(df_train.index)))
print('from ' + str(len(df_train['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_train['subject_id'].unique())) + 'patients')
print('length index test set: ' + str(len(df_test.index)))
print('from ' + str(len(df_test['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_test['subject_id'].unique())) + 'patients')
print('sampling completed')
def characteristics(df, marker):
print(marker)
n_patients=len(df['subject_id'].unique())
n_admissions=len(df['hadm_id'].unique())
n_observations=len(df)
n_alarms=len(df[df['alarm']==1])
n_events=len(df[df['event']==1])
PPV=len(df[(df['alarm']==1) & (df['event']==1)])/len(df[df['alarm']==1])
sensitivity=len(df[(df['alarm']==1) & (df['event']==1)])/len(df[df['event']==1])
n_hadm_events=len(df[df['event']==1]['hadm_id'].unique())
n_hadm_arrest=len(df[df['arrest_event']==1]['hadm_id'].unique())
n_hadm_ICU=len(df[df['icu_event']==1]['hadm_id'].unique())
n_hadm_death=len(df[df['mortality_event']==1]['hadm_id'].unique())
n_hadm_reOP=len(df[df['reOP_event']==1]['hadm_id'].unique())
age=df.drop_duplicates('hadm_id').age.median()
age25, age75=np.percentile(df.drop_duplicates('hadm_id').age, [25 ,75])
cci=df.drop_duplicates('hadm_id').cci.median()
cci25, cci75=np.percentile(df.drop_duplicates('hadm_id').cci, [25 ,75])
print('n_patients = ' + str(n_patients))
print('n_admissions = ' + str(n_admissions))
print('n_observations = ' + str(n_observations))
print('n_alarms = ' + str(n_alarms))
print('n_events = ' + str(n_events))
print('PPV = ' + str(PPV))
print('sensitivity = ' + str(sensitivity))
print('n_hadm_events = ' + str(n_hadm_events))
print('n_hadm_arrest = ' + str(n_hadm_arrest))
print('n_hadm_ICU = ' + str(n_hadm_ICU))
print('n_hadm_death = ' + str(n_hadm_death))
print('n_hadm_reOP = ' + str(n_hadm_reOP))
print('age = ' + str(age))
print('age25 = ' + str(age25))
print('age75 = ' + str(age75))
print('cci = ' + str(cci))
print('cci25 = ' + str(cci25))
print('cci75 = ' + str(cci75))
print('---------------------------------')
hadm_event = list(df_analysis[df_analysis['event'] == 1]['hadm_id'].unique())
df_analysis_1 = df_analysis[df_analysis['hadm_id'].isin(hadm_event)]
df_analysis_2 = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
df_analysis_3 = df_analysis_2[(df_analysis_2['event'] == 1)]
df_analysis_ideal = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['event'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
df_analysis_test = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].sample(frac=1).drop_duplicates(subset=['hadm_id'], keep='first')
characteristics(df_analysis, 'All')
characteristics(df_analysis_1, 'At least 1 event')
characteristics(df_analysis_2, 'first NEWS')
characteristics(df_analysis_3, 'first NEWS with event')
characteristics(df_analysis_test, 'random selection')
characteristics(df_analysis_ideal, 'ideal')
# prescriptions
ax=df_analysis['number_prescriptions_groups'].hist(bins=15)
ax.set_title('Distribution of the number of new prescription within 24 hours \n for '+pop.description)
ax.set_xlabel('number of new prescriptions')
ax.set_ylabel('number of observation sets')
plt.show()
unique_set = set()
for i in df_analysis.index:
index_pres = df_analysis.new_prescriptions_grouped[i]
new_pres = set(index_pres) - unique_set
unique_set.update(new_pres)
# number and name of unique prescription groups in the sampled observations
print('groups present in the sampling of interest: ' + str(len(unique_set)))
print(unique_set)
# number and details of prescription regimes in the sampled observations
li = list(df_analysis.new_prescriptions_grouped)
count = Counter(str(e) for e in li)
print('\nOccurence of prescription groups combinations: ' + str(len(count)))
print(count)
print(df_analysis['number_prescriptions_groups'].describe())
# procedures
ax=df_analysis['number_procedures_groups'].hist(bins=15)
ax.set_title('Distribution of the number of new procedure within 24 hours \n for the sampled contact points')
ax.set_xlabel('number of new procedures')
ax.set_ylabel('number of observation sets')
plt.show()
unique_set = set()
for i in df_analysis.index:
index_proc = df_analysis.new_procedures_grouped[i]
new_proc = set(index_proc) - unique_set
unique_set.update(new_proc)
# number and name of unique procedure groups in the sampled observations
print('groups present in the sampling of interest: ' + str(len(unique_set)))
print(unique_set)
# number and details of procedure regimes in the sampled observations
li = list(df_analysis.new_procedures_grouped)
count = Counter(str(e) for e in li)
print('\nOccurence of procedure groups combinations: ' + str(len(count)))
print(count)
print(df_analysis['number_procedures_groups'].describe())
All analyses below should be repeated for each input features set
# select method of input selection:
#"all" = all features
#"missingness" = all vital signs + all admin data + all labs with less than the specified missingness rate
#"vitals" = only vital signs (same as in NEWS)
#"hand_picked" = selected list of variables based on the author clinical experience
method = 'missingness'
# max % of missing values (only if method = missingness)
max_missing = 66.66
# define input features
input_features = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS',
'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC','age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']
if method == 'all':
variables = input_features
elif method == 'missingness':
# calculate the rate of missing data for each feature (considered missing if no value in the 26h preceeding the contact point), only apply to lab variables
missing = df_analysis[['ALB_miss', 'ALP_miss', 'ALT_miss', 'APTT_miss', 'AST_miss', 'BAS_miss', 'BASEX_miss', 'BICAR_miss', 'CAL_miss', 'CL_miss',
'CR_miss', 'CRP_miss', 'COCA_miss', 'EGFR_miss', 'EOS_miss', 'GLU_miss', 'HCT_miss', 'HGB_miss', 'INR_miss', 'KET_miss',
'LAC_miss', 'LYM_miss', 'MCH_miss', 'MCHC_miss', 'MCV_miss', 'METHB_miss', 'MON_miss', 'MPV_miss', 'NEU_miss', 'OSM_miss', 'PH_miss',
'PLT_miss', 'POT_miss', 'RBC_miss', 'SOD_miss', 'TBIL_miss', 'TROP_miss', 'UR_miss', 'WBC_miss']].mean()*100
variables_ = missing[missing>(100-max_missing)].index.to_list()
variables=[]
for var_ in variables_:
var = var_.replace('_miss', '')
variables.append(var)
variables = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2'] + variables + ['age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']
elif method == 'hand_picked':
variables = ['HR','RR', 'SBP', 'SPO2', 'TEMP', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST','BICAR', 'CR', 'CRP', 'EGFR', 'GLU', 'HCT', 'HGB', 'INR',
'LAC', 'PH', 'PLT', 'POT', 'SOD', 'TBIL', 'WBC','age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']
elif method == 'vitals':
variables = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2']
else:
print('input features selection error')
# add delta to lab values in variables
for var in variables:
if var in ['ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM',
'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']:
var0 = var + '_delta'
variables.append(var0)
# add delta and minmax to vitals in variables
for vitals in variables:
if vitals in ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2']:
vitals_minmax = vitals + '_minmax'
variables.append(vitals_minmax)
vitals_delta = vitals + '_delta'
variables.append(vitals_delta)
print(variables)
print('initial dimensionality: ' + str(len(variables)))
# describe autoencoder parameters to optimise
N_layers = [2, 4, 8, 16]
N_neurons = [64, 128, 256, 512]
Latent_dimension = [4, 8, 16, 32]
results = np.zeros((len(N_layers), len(N_neurons), len(Latent_dimension)))
np.random.seed(1301)
random.set_seed(2206)
start=datetime.datetime.now()
for n_layers in N_layers:
print('+++++++++++++++')
pos_layer = N_layers.index(n_layers)
for n_neurons in N_neurons:
print('------------')
pos_neuron = N_neurons.index(n_neurons)
for latent_dimension in Latent_dimension:
pos_latent = Latent_dimension.index(latent_dimension)
print([n_layers, n_neurons, latent_dimension])
vector_res = []
# kfold crossvalidation
X_gss = df_train
y_gss = df_train.hospital_code # only to allow function to run
group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
group_kfold.get_n_splits()
for train_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
df_opti = X_gss.iloc[train_index]
df_val = X_gss.iloc[val_index]
X = df_opti[variables]
X_val = df_val[variables]
dimension = len(variables)
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_val_unscaled = X_val.to_numpy()
X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)
# define encoder architecture
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): # enable optimisation on the number of layers
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
# latent space
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)
# define decoder architecture
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers): # idem
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
# fit autoencoder
AE = autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=32, shuffle=True,
validation_data=(X_val_scaled, X_val_scaled), verbose = 0)
# update results matrix
res_ = AE.history['val_loss'][-1]
vector_res.append(res_)
res = np.mean(vector_res)
print(res)
results[pos_layer, pos_neuron, pos_latent] = res
best_param = np.argwhere(results == np.amin(results))
print(results)
print('Best parameters and loss')
print(best_param[0])
print(results[(best_param[0][0],best_param[0][1],best_param[0][2])])
end=datetime.datetime.now()
print('runtime: ' + str(end-start))
# save optimisation table if needed
results_ = results
names = ['x', 'y', 'z']
index = pd.MultiIndex.from_product([range(s)for s in results_.shape], names=names)
df = pd.DataFrame({'A': results_.flatten()}, index=index)['A']
df = df.unstack(level='z').sort_index()
df.columns = ['Latent_4', 'Latent_8', 'Latent_16', 'Latent_32']
df.index.names = ['Layer', 'Neuron']
print(df)
# save the parameter optimisation table if needed
df.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_optimisation_first_vitals.csv')
start=datetime.datetime.now()
#dim_reduction = ['none', 'PCA', 'AE']
#optimisation_method = ['elbow', 'silhouette']
# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]
np.random.seed(1301)
random.set_seed(2206)
def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
# select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
n_neurons = 128
n_layers = 2
score_method = []
df_results = pd.DataFrame(index = n_clusters)
X = df_train[variables]
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
# dimensionality reduction
if dim == 'PCA':
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
elif dim == 'AE':
dimension = len(variables)
batch = 32
# define encoder architecture
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
# latent space
encoded = layers.Dense(n_latdim, activation='relu')(encoded)
# define decoder architecture
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
# fit autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
elif dim == 'none':
X_latent = X_scaled
else:
print('dimensionality reduction error')
# cluster number optimisation method
results_cluster = []
for n in n_clusters:
if method == 'elbow':
seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10,
random_state=seed_kmeans, tol=0.0001, verbose=0)
results_opti = kmeansModel.fit_predict(X_latent)
score_ = kmeansModel.inertia_
elif method == 'silhouette':
seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10,
random_state=seed_kmeans, tol=0.0001, verbose=0)
results_opti = kmeansModel.fit_predict(X_latent)
score_ = metrics.silhouette_score(X_latent, results_opti, metric='euclidean')
else:
print('evaluation method error')
results_cluster.append(score_)
return results_cluster
#--------------------------------------------------------------------------------------------------------
# get the inertia values (elbow method) and silhouette scores for each combination of k-means and dimensionality reduction method
df_opti_none_elbow = pd.DataFrame(index = number_clusters, columns = ['none_elbow'])
new_col = f_cluster_opti(variables, dim='none', method = 'elbow', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_elbow['none_elbow'] = new_col
print('df_opti_none_elbow done')
df_opti_none_silhouette = pd.DataFrame(index = number_clusters, columns = ['none_silhouette'])
new_col = f_cluster_opti(variables, dim='none', method = 'silhouette', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_silhouette['none_silhouette'] = new_col
print('df_opti_none_silhouette done')
df_opti_PCA_elbow = pd.DataFrame(index = number_clusters, columns = ['PCA_elbow_0', 'PCA_elbow_1', 'PCA_elbow_2', 'PCA_elbow_3', 'PCA_elbow_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'elbow', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_elbow['PCA_elbow_'+str(j)] = new_col
print('df_opti_PCA_elbow done')
df_opti_PCA_silhouette = pd.DataFrame(index = number_clusters, columns = ['PCA_silhouette_0', 'PCA_silhouette_1', 'PCA_silhouette_2', 'PCA_silhouette_3', 'PCA_silhouette_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'silhouette', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_silhouette['PCA_silhouette_'+str(j)] = new_col
print('df_opti_PCA_silhouette done')
df_opti_AE_elbow = pd.DataFrame(index = number_clusters, columns = ['AE_elbow_0', 'AE_elbow_1', 'AE_elbow_2', 'AE_elbow_3', 'AE_elbow_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'elbow', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_elbow['AE_elbow_'+str(j)] = new_col
print('df_opti_AE_elbow done')
df_opti_AE_silhouette = pd.DataFrame(index = number_clusters, columns = ['AE_silhouette_0', 'AE_silhouette_1', 'AE_silhouette_2', 'AE_silhouette_3', 'AE_silhouette_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'silhouette', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_silhouette['AE_silhouette_'+str(j)] = new_col
print('df_opti_AE_silhouette done')
# save results for reuse in graph production if needed
df_opti_none_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_none_elbow_first_missingness.csv')
df_opti_none_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_none_silhouette_first_missingness.csv')
df_opti_PCA_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_PCA_elbow_first_missingness.csv')
df_opti_PCA_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_PCA_silhouette_first_missingness.csv')
df_opti_AE_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_AE_elbow_first_missingness.csv')
df_opti_AE_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_AE_silhouette_first_missingness.csv')
# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()
axs[0].plot(df_opti_none_elbow['none_elbow'])
axs[0].set_title('k-means without dimensionality reduction,\n elbow method', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Inertia', fontsize = 12, labelpad = 10)
axs[1].plot(df_opti_none_silhouette['none_silhouette'])
axs[1].set_title('k-means without dimensionality reduction,\n silhouette score', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)
axs[2].plot(df_opti_PCA_elbow[['PCA_elbow_0', 'PCA_elbow_1', 'PCA_elbow_2', 'PCA_elbow_3', 'PCA_elbow_4']])
axs[2].set_title('PCA + k-means,\n elbow method', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Inertia', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper right')
axs[3].plot(df_opti_PCA_silhouette[['PCA_silhouette_0', 'PCA_silhouette_1', 'PCA_silhouette_2', 'PCA_silhouette_3', 'PCA_silhouette_4']])
axs[3].set_title('PCA + k-means,\n silhouette score', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper right')
axs[4].plot(df_opti_AE_elbow[['AE_elbow_0', 'AE_elbow_1', 'AE_elbow_2', 'AE_elbow_3', 'AE_elbow_4']])
axs[4].set_title('Autoencoder + k-means,\n elbow method', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Inertia', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper right')
axs[5].plot(df_opti_AE_silhouette[['AE_silhouette_0', 'AE_silhouette_1', 'AE_silhouette_2', 'AE_silhouette_3', 'AE_silhouette_4']])
axs[5].set_title('Autoencoder + k-means,\n silhouette score', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper right')
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigElbowSilhouette_first_missingness_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice
start=datetime.datetime.now()
# custom score for cluster number optimisation using the distribution of distances from cluster centroids (abnormality detection). Should be minimised.
def clustering_score(df_opti, df_val, variables, centroids, X_opti_latent, X_val_latent):
def probability_function(cluster, centroids, target, df_opti, X_opti_latent):
df_cluster = df_opti[df_opti['cluster']==cluster]
d = np.square(X_opti_latent[df_cluster.index] - centroids[cluster]).sum(axis=1) # Euclidean distance
d_target = np.square(target - centroids[cluster]).sum()
density_below = len(d[d<d_target])/len(d)
score_density = -np.log(1-density_below+0.01) # 0.01 added to avoid log(0)=inf
return score_density
score = []
for i in df_val.index:
cluster = df_val.loc[i, 'cluster']
score_ = probability_function(cluster, centroids, X_val_latent[i], df_opti, X_opti_latent)
score.append(score_)
final_score = (np.mean(score))
return final_score
#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']
# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]
np.random.seed(1301)
random.set_seed(2206)
def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
# select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
n_neurons = 64
n_layers = 2
score_method = []
# kfold crossvalidation
X_gss = df_train
y_gss = df_train.number_prescriptions_groups # only to allow function to run
group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
group_kfold.get_n_splits()
df_results = pd.DataFrame(index = n_clusters)
k = 1
for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
df_opti = X_gss.iloc[opti_index].copy()
df_val = X_gss.iloc[val_index].copy()
df_opti.reset_index(inplace=True, drop=True)
df_val.reset_index(inplace=True, drop=True)
X = df_opti[variables]
X_val = df_val[variables]
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_val_unscaled = X_val.to_numpy()
X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)
# dimensionality reduction
if dim == 'PCA':
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_val_latent = pca.transform(X_val_scaled)
elif dim == 'AE':
dimension = len(variables)
batch = 32
# define encoder architecture
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
# latent space
encoded = layers.Dense(n_latdim, activation='relu')(encoded)
# define decoder architecture
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
# fit autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_val_latent = encoder.predict(X_val_scaled)
elif dim == 'none':
X_latent = X_scaled
X_val_latent = X_val_scaled
else:
print('dimensionality reduction error')
# clustering
results_cluster = []
for n in n_clusters:
if method == 'kmeans':
seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10,
random_state=seed_kmeans, tol=0.0001, verbose=0)
results_opti = kmeansModel.fit_predict(X_latent)
results_val = kmeansModel.predict(X_val_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
df_val.loc[:,'cluster'] = results_val.tolist()
centroids = kmeansModel.cluster_centers_
score_clust = clustering_score(df_opti, df_val, variables, centroids, X_latent, X_val_latent)
elif method == 'hierarchical':
h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='ward')
results_opti = h_clust.fit_predict(X_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
for i in df_val.index:
dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
df_opti['dist']=dist
target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
df_val.loc[i,'cluster'] = target_cluster
df_val['cluster'] = df_val['cluster'].apply(np.int64)
# calculate cluster centroids
points = X_latent
labels = results_opti
clf = NearestCentroid()
clf.fit(points, labels)
centroids = clf.centroids_
# calculate performance
score_clust = clustering_score(df_opti, df_val, variables, centroids, X_latent, X_val_latent)
else:
print('clustering model error')
results_cluster.append(score_clust)
df_results['split_'+str(k)] = results_cluster
print(k)
k = k+1
results = df_results.mean(axis=1)
return results
#--------------------------------------------------------------------------------------------------------
# get the mean log score values for each combination of dimensionality reduction and clustering methods
df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')
df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')
df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')
df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_AE_hierarchical_first_vitals.csv')
# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()
axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigScoreLog_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice
start=datetime.datetime.now()
# custom score for cluster number optimisation using recall at position. Should be maximised.
def pres_recommendation_perf(df_train, df_test, n_clusters):
cluster_prescriptions_top_10 = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_cluster_pres.columns = ['prevalence']
df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
cluster_top_10 = df_cluster_pres[df_cluster_pres['rank'] <= 10].index.to_list()
cluster_prescriptions_top_10["cluster_" + str(c)] = cluster_top_10
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
overall_perf_ = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_prescriptions_top_10["cluster_" + str(cluster)]
prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
recommended_pres = intersection(prescriptions, top_10_cluster)
performance = len(recommended_pres)/len(prescriptions)
overall_perf_ += performance
overall_perf = overall_perf_/len(df_test.index)
return overall_perf
#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']
# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]
np.random.seed(1301)
random.set_seed(2206)
def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
# select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
n_neurons = 64
n_layers = 2
score_method = []
# kfold crossvalidation
X_gss = df_train
y_gss = df_train.number_prescriptions_groups # only to allow function to run
group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
group_kfold.get_n_splits()
df_results = pd.DataFrame(index = n_clusters)
k = 1
for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
df_opti = X_gss.iloc[opti_index].copy()
df_val = X_gss.iloc[val_index].copy()
df_opti.reset_index(inplace=True, drop=True)
df_val.reset_index(inplace=True, drop=True)
X = df_opti[variables]
X_val = df_val[variables]
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_val_unscaled = X_val.to_numpy()
X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)
# dimensionality reduction
if dim == 'PCA':
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_val_latent = pca.transform(X_val_scaled)
elif dim == 'AE':
dimension = len(variables)
batch = 32
# define encoder architecture
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
# latent space
encoded = layers.Dense(n_latdim, activation='relu')(encoded)
# define decoder architecture
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
# fit autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_val_latent = encoder.predict(X_val_scaled)
elif dim == 'none':
X_latent = X_scaled
X_val_latent = X_val_scaled
else:
print('dimensionality reduction error')
# clustering
results_cluster = []
for n in n_clusters:
if method == 'kmeans':
seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10,
random_state=seed_kmeans, tol=0.0001, verbose=0)
results_opti = kmeansModel.fit_predict(X_latent)
results_val = kmeansModel.predict(X_val_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
df_val.loc[:,'cluster'] = results_val.tolist()
score_clust = pres_recommendation_perf(df_train = df_opti, df_test = df_val, n_clusters = n)
elif method == 'hierarchical':
h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='average')
results_opti = h_clust.fit_predict(X_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
for i in df_val.index:
dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
df_opti['dist']=dist
target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
df_val.loc[i,'cluster'] = target_cluster
df_val['cluster'] = df_val['cluster'].apply(np.int64)
score_clust = pres_recommendation_perf(df_train = df_opti, df_test = df_val, n_clusters = n)
else:
print('clustering model error')
results_cluster.append(score_clust)
df_results['split_'+str(k)] = results_cluster
print(k)
k = k+1
results = df_results.mean(axis=1)
return results
#--------------------------------------------------------------------------------------------------------
# get the mean recall at position values for each combination of dimensionality reduction and clustering methods
df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')
df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')
df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')
df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_AE_hierarchical_first_vitals.csv')
# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()
axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigRecall_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice
start=datetime.datetime.now()
# custom score for cluster number optimisation using the custom proportion score. Should be maximised.
def pres_recommendation_score_prop(df_train, df_test, n_clusters):
cluster_prescriptions_top_10_prop = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
number_pres_tot = len(pres_list)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_cluster_pres.columns = ['prevalence']
df_cluster_pres['proportion'] = df_cluster_pres['prevalence']/number_pres_tot
df_cluster_pres['%_hadm_receiving'] = df_cluster_pres['prevalence']/len(df_train[df_train['cluster'] == c].index)
df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
cluster_top_10_prop = df_cluster_pres[df_cluster_pres['rank'] <= 10]
cluster_prescriptions_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
score = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].index.to_list()
score_ = 0
number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
for drug in df_test.loc[i, 'new_prescriptions_grouped']:
if drug in top_10_cluster:
score_drug = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of prescriptions
score_i = score_/number_prescription
score += score_i
final_score = score/len(df_test.index)
return final_score
#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']
# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]
np.random.seed(1301)
random.set_seed(2206)
def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
# select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
n_neurons = 64
n_layers = 2
score_method = []
# kfold crossvalidation
X_gss = df_train
y_gss = df_train.number_prescriptions_groups # only to allow function to run
group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
group_kfold.get_n_splits()
df_results = pd.DataFrame(index = n_clusters)
k = 1
for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
df_opti = X_gss.iloc[opti_index].copy()
df_val = X_gss.iloc[val_index].copy()
df_opti.reset_index(inplace=True, drop=True)
df_val.reset_index(inplace=True, drop=True)
X = df_opti[variables]
X_val = df_val[variables]
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_val_unscaled = X_val.to_numpy()
X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)
# dimensionality reduction
if dim == 'PCA':
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_val_latent = pca.transform(X_val_scaled)
elif dim == 'AE':
dimension = len(variables)
batch = 32
# define encoder architecture
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
# latent space
encoded = layers.Dense(n_latdim, activation='relu')(encoded)
# define decoder architecture
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
# fit autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_val_latent = encoder.predict(X_val_scaled)
elif dim == 'none':
X_latent = X_scaled
X_val_latent = X_val_scaled
else:
print('dimensionality reduction error')
# clustering
results_cluster = []
for n in n_clusters:
if method == 'kmeans':
seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10,
random_state=seed_kmeans, tol=0.0001, verbose=0)
results_opti = kmeansModel.fit_predict(X_latent)
results_val = kmeansModel.predict(X_val_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
df_val.loc[:,'cluster'] = results_val.tolist()
score_clust = pres_recommendation_score_prop(df_train = df_opti, df_test = df_val, n_clusters = n)
elif method == 'hierarchical':
h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='ward')
results_opti = h_clust.fit_predict(X_latent)
df_opti.loc[:,'cluster'] = results_opti.tolist()
for i in df_val.index:
dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
df_opti['dist']=dist
target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
df_val.loc[i,'cluster'] = target_cluster
df_val['cluster'] = df_val['cluster'].apply(np.int64)
score_clust = pres_recommendation_score_prop(df_train = df_opti, df_test = df_val, n_clusters = n)
else:
print('clustering model error')
results_cluster.append(score_clust)
df_results['split_'+str(k)] = results_cluster
print(k)
k = k+1
results = df_results.mean(axis=1)
return results
#--------------------------------------------------------------------------------------------------------
# get the mean proportion score values for each combination of dimensionality reduction and clustering methods
df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')
df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')
df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')
df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_hierarchical_first_vitals.csv')
# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()
axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')
axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')
axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigScoreProp_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
# recall at position 10
def random_performance(df_train, df_test):
np.random.seed(2206)
unique_set = set()
# create a set of all prescription groups
for i in df_train.index:
index_pres = df_train.new_prescriptions_grouped[i]
new_pres = set(index_pres) - unique_set
unique_set.update(new_pres)
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
overall_perf_ = 0
for i in df_test.index:
# pick a random sample of 10 prescriptions without replacement
random_pick = np.random.choice(list(unique_set), 10, replace=False)
# calculate the recall at position 10 value
prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
recommended_pres = intersection(prescriptions, random_pick)
performance = len(recommended_pres)/len(prescriptions)
overall_perf_ += performance
overall_perf = overall_perf_/len(df_test.index)
return overall_perf
# score proportion
def random_performance_prop(df_train, df_test):
np.random.seed(2206)
unique_set = set()
create a set of all prescription groups
for i in df_train.index:
index_pres = df_train.new_prescriptions_grouped[i]
new_pres = set(index_pres) - unique_set
unique_set.update(new_pres)
# gets proportion of prescriptions in train set
target=df_train['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
number_pres_tot = len(pres_list)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_pres.columns = ['prevalence']
df_pres['proportion'] = df_pres['prevalence']/number_pres_tot
df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(df_train.index)
score = 0
for i in df_test.index:
# pick a random sample of 10 prescriptions without replacement
random_pick = np.random.choice(list(unique_set), 10, replace=False)
# calculate the proportion score
score_ = 0
number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
for drug in df_test.loc[i, 'new_prescriptions_grouped']:
if drug in random_pick:
score_drug = df_pres.loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of prescriptions
score_i = score_/number_prescription
score += score_i
final_score = score/len(df_test.index)
return final_score
print('Performance of random selected list of 10 prescriptions: ' +str(random_performance(df_train, df_test)))
print('Performance of random selected list of 10 prescriptions with proportion: ' +str(random_performance_prop(df_train, df_test)))
# recall at position 10
def random_performance(df_train, df_test):
np.random.seed(2206)
unique_set = set()
# create a set of all procedure groups
for i in df_train.index:
index_proc = df_train.new_procedures_grouped[i]
new_proc = set(index_proc) - unique_set
unique_set.update(new_proc)
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
overall_perf_ = 0
for i in df_test.index:
# pick a random sample of 10 procedures without replacement
random_pick = np.random.choice(list(unique_set), 10, replace=False)
# calculate the recall at position 10 value
procedures = df_test.loc[i, 'new_procedures_grouped']
recommended_proc = intersection(procedures, random_pick)
performance = len(recommended_proc)/len(procedures)
overall_perf_ += performance
overall_perf = overall_perf_/len(df_test.index)
return overall_perf
# score proportion
def random_performance_prop(df_train, df_test):
np.random.seed(2206)
unique_set = set()
# create a set of all procedure groups
for i in df_train.index:
index_proc = df_train.new_procedures_grouped[i]
new_proc = set(index_proc) - unique_set
unique_set.update(new_proc)
# gets proportion of procedures in train set
target=df_train['new_procedures_grouped']
proc_list =[]
for x in target:
proc_list.extend(x)
number_proc_tot = len(proc_list)
df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
df_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
df_proc.columns = ['prevalence']
df_proc['proportion'] = df_proc['prevalence']/number_proc_tot
df_proc['%_hadm_receiving'] = df_proc['prevalence']/len(df_train.index)
score = 0
for i in df_test.index:
# pick a random sample of 10 procedures without replacement
random_pick = np.random.choice(list(unique_set), 10, replace=False)
# calculate the proportion score
score_ = 0
number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
for drug in df_test.loc[i, 'new_procedures_grouped']:
if drug in random_pick:
score_drug = df_proc.loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of procedures
score_i = score_/number_procedure
score += score_i
final_score = score/len(df_test.index)
return final_score
print('Performance of random selected list of 10 procedures: ' +str(random_performance(df_train, df_test)))
print('Performance of random selected list of 10 procedures with proportion: ' +str(random_performance_prop(df_train, df_test)))
# recall at position 10
def top10_performance(df_train, df_test):
#creates a list of all prescribed drug groups
target=df_train['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
# identify the top 10 most prescribed drug groups
top_10 = [pres for pres, cnt in Counter(pres_list).most_common(10)]
print(top_10)
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
# calculate the recall at position 10 value
overall_perf_ = []
for i in df_test.index:
prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
recommended_pres = intersection(prescriptions, top_10)
performance = len(recommended_pres)/len(prescriptions)
overall_perf_.append(performance)
overall_perf = np.mean(overall_perf_)
return overall_perf
# score proportion
def top10_performance_prop(df_train, df_test):
#creates a list of all precribed drug groups
target=df_train['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
# identify the top 10 most prescribed drug groups and the proportion of admissions receiving them
number_pres_tot = len(pres_list)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_pres.columns = ['prevalence']
df_pres['proportion'] = df_pres['prevalence']/number_pres_tot
df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(df_train.index)
df_pres['rank'] = df_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
top_10 = df_pres[df_pres['rank'] <= 10].index.to_list()
print(df_pres[df_pres['rank'] <= 10])
# calculate the proportion score
score = 0
for i in df_test.index:
score_ = 0
number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
for drug in df_test.loc[i, 'new_prescriptions_grouped']:
if drug in top_10:
score_drug = df_pres.loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of prescriptions
score_i = score_/number_prescription
score += score_i
final_score = score/len(df_test.index)
return final_score
print('Performance of most prescribed list of 10 prescriptions: ' +str(top10_performance(df_train, df_test)))
print('Performance of most prescribed list of 10 prescriptions with proportion: ' +str(top10_performance_prop(df_train, df_test)))
# recall at position 10
def top10_performance(df_train, df_test):
#creates a list of all prescribed procedure groups
target=df_train['new_procedures_grouped']
proc_list =[]
for x in target:
proc_list.extend(x)
# identify the top 10 most prescribed procedure groups
top_10 = [proc for proc, cnt in Counter(proc_list).most_common(10)]
print(top_10)
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
# calculate the recall at position 10 value
overall_perf_ = []
for i in df_test.index:
procedures = df_test.loc[i, 'new_procedures_grouped']
recommended_proc = intersection(procedures, top_10)
performance = len(recommended_proc)/len(procedures)
overall_perf_.append(performance)
overall_perf = np.mean(overall_perf_)
return overall_perf
# score proportion
def top10_performance_prop(df_train, df_test):
#creates a list of all precribed drug groups
target=df_train['new_procedures_grouped']
proc_list =[]
for x in target:
proc_list.extend(x)
# identify the top 10 most prescribed procedure groups and the proportion of admissions receiving them
number_proc_tot = len(proc_list)
df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
df_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
df_proc.columns = ['prevalence']
df_proc['proportion'] = df_proc['prevalence']/number_proc_tot
df_proc['%_hadm_receiving'] = df_proc['prevalence']/len(df_train.index)
df_proc['rank'] = df_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
top_10 = df_proc[df_proc['rank'] <= 10].index.to_list()
print(df_proc[df_proc['rank'] <= 10])
# calculate the proportion score
score = 0
for i in df_test.index:
score_ = 0
number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
for drug in df_test.loc[i, 'new_procedures_grouped']:
if drug in top_10:
score_drug = df_proc.loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of procedures
score_i = score_/number_procedure
score += score_i
final_score = score/len(df_test.index)
return final_score
print('Performance of most prescribed list of 10 procedures: ' +str(top10_performance(df_train, df_test)))
print('Performance of most prescribed list of 10 procedures with proportion: ' +str(top10_performance_prop(df_train, df_test)))
# recall at position 10
def pres_recommendation_perf(df_train, df_test, n_clusters):
# for each cluster, get the top 10 prescriptions with the proportion of admissions in the cluster receiving them
cluster_prescriptions_top_10 = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_cluster_pres.columns = ['prevalence']
df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
cluster_top_10 = df_cluster_pres[df_cluster_pres['rank'] <= 10].index.to_list()
cluster_prescriptions_top_10["cluster_" + str(c)] = cluster_top_10
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
# calculate the recall at position 10 value (test patient's prescriptions compared to the top 10 prescriptions in the most similar cluster)
overall_perf_ = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_prescriptions_top_10["cluster_" + str(cluster)]
prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
recommended_pres = intersection(prescriptions, top_10_cluster)
performance = len(recommended_pres)/len(prescriptions)
overall_perf_ += performance
overall_perf = overall_perf_/len(df_test.index)
return overall_perf
# proportion score
def pres_recommendation_score_prop(df_train, df_test, n_clusters):
# for each cluster, get the top 10 prescriptions with the proportion of admissions in the cluster receiving them
cluster_prescriptions_top_10_prop = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
pres_list =[]
for x in target:
pres_list.extend(x)
number_pres_tot = len(pres_list)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_cluster_pres.columns = ['prevalence']
df_cluster_pres['proportion'] = df_cluster_pres['prevalence']/number_pres_tot
df_cluster_pres['%_hadm_receiving'] = df_cluster_pres['prevalence']/len(df_train[df_train['cluster'] == c].index)
df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
cluster_top_10_prop = df_cluster_pres[df_cluster_pres['rank'] <= 10]
cluster_prescriptions_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
# calculate the proportion score (test patient's prescriptions compared to the top 10 prescriptions in the most similar cluster)
score = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].index.to_list()
score_ = 0
number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
for drug in df_test.loc[i, 'new_prescriptions_grouped']:
if drug in top_10_cluster:
score_drug = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of prescriptions
score_i = score_/number_prescription
score += score_i
final_score = score/len(df_test.index)
return final_score
#--------------------------------------------------------------------------------------------------------------------
# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta',
'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta',
'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta',
'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta',
'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']
# (paste autoencoder parameters according to input features set selected)
n_neurons = 128
n_layers = 2
latent_dimension = 42
n_components = 0.8 # number of components selected to explain > 80% of the variance
# select optimal cluster number for each combination according to the cluster number optimisation results
n_cluster_PCA_kmeans = 6
n_cluster_AE_kmeans = 4
n_cluster_PCA_hierarchical = 3
n_cluster_AE_hierarchical = 4
# set random seed
np.random.seed(1301)
random.set_seed(2206)
# data preparation
X = df_train[variables]
X_test = df_test[variables]
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_test_unscaled = X_test.to_numpy()
X_test_scaled = StandardScaler().fit_transform(X_test_unscaled)
#--------------------------------------------------------------------------------------------------------------------
# PCA + k-means
# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
print('number of components: ' + str(pca.n_components_))
# define the k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_PCA_kmeans, n_init=10,
random_state=2206, tol=0.0001, verbose=0)
# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()
# assess performance
perf_PCA_kmeans = pres_recommendation_perf(df_train, df_test, n_cluster_PCA_kmeans)
score_prop_PCA_kmeans = pres_recommendation_score_prop(df_train, df_test, n_cluster_PCA_kmeans)
print('++++++++++++++++++')
print('Recall PCA + k-means: ' + str(perf_PCA_kmeans))
print('Score prop PCA + k-means: ' + str(score_prop_PCA_kmeans))
#--------------------------------------------------------------------------------------------------------------------
# PCA + hierarchical clustering
# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
print('number of components: ' + str(pca.n_components_))
# define the hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_PCA_hierarchical, affinity = 'euclidean', linkage ='ward')
# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
ave_cluster_dist = []
for cluster in range(0, n_cluster_PCA_hierarchical):
average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
ave_cluster_dist.append(average_dist)
target_cluster = np.argmin(ave_cluster_dist)
df_test.loc[i,'cluster'] = target_cluster
# assess performance
perf_PCA_hierarchical = pres_recommendation_perf(df_train, df_test, n_cluster_PCA_hierarchical)
score_prop_PCA_hierarchical = pres_recommendation_score_prop(df_train, df_test, n_cluster_PCA_hierarchical)
print('++++++++++++++++++')
print('Recall PCA + hierarchical: ' + str(perf_PCA_hierarchical))
print('Score prop PCA + hierarchical: ' + str(score_prop_PCA_hierarchical))
#--------------------------------------------------------------------------------------------------------------------
# AE + k-means
# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=200, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)
# define the k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_AE_kmeans, n_init=10,
random_state=2206, tol=0.0001, verbose=0)
# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()
# assess performance
perf_AE_kmeans = pres_recommendation_perf(df_train, df_test, n_cluster_AE_kmeans)
score_prop_AE_kmeans = pres_recommendation_score_prop(df_train, df_test, n_cluster_AE_kmeans)
print('++++++++++++++++++')
print('Recall AE + k-means: ' + str(perf_AE_kmeans))
print('Score prop AE + k-means: ' + str(score_prop_AE_kmeans))
#--------------------------------------------------------------------------------------------------------------------
# AE + hierarchical clustering
# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)
# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_AE_hierarchical, affinity = 'euclidean', linkage ='ward')
# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
ave_cluster_dist = []
for cluster in range(0, n_cluster_AE_hierarchical):
average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
ave_cluster_dist.append(average_dist)
target_cluster = np.argmin(ave_cluster_dist)
df_test.loc[i,'cluster'] = target_cluster
# assess performance
perf_AE_hierarchical = pres_recommendation_perf(df_train, df_test, n_cluster_AE_hierarchical)
score_prop_AE_hierarchical = pres_recommendation_score_prop(df_train, df_test, n_cluster_AE_hierarchical)
print('++++++++++++++++++')
print('Performance AE + hierarchical: ' + str(perf_AE_hierarchical))
print('Score prop AE + hierarchical: ' + str(score_prop_AE_hierarchical))
# dysplay the top ten prescriptions in each of the n clusters given by the best performing model combination.
# calculate in each of these same clusters the proprtion of the admissions which will experience a critical event within 24h of the contact point.
print('++++++++++++++++++')
print('best clusters') # in this case, 4 clusters produced by a combination of autoencoder and hierarchical clustering
for n in range(0, n_cluster_AE_hierarchical):
cluster_ = df_train[df_train['cluster']==n]['new_prescriptions_grouped']
print('size cluster: ' + str(len(cluster_)))
risk = len(df_train[(df_train['cluster']==n) & (df_train['event']==1)])/len(df_train[df_train['cluster']==n])
print('risk: '+ str(risk))
pres_list =[]
for x in cluster_:
pres_list.extend(x)
df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
df_pres.columns = ['prevalence']
df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(cluster_.index)
df_pres['rank'] = df_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
print(df_pres[df_pres['rank'] <= 10])
# recall at position 10
def proc_recommendation_perf(df_train, df_test, n_clusters):
# for each cluster, get the top 10 procedures with the proportion of admissions in the cluster receiving them
cluster_procedures_top_10 = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_procedures_grouped']
proc_list =[]
for x in target:
proc_list.extend(x)
df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
df_cluster_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
df_cluster_proc.columns = ['prevalence']
df_cluster_proc['rank'] = df_cluster_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
cluster_top_10 = df_cluster_proc[df_cluster_proc['rank'] <= 10].index.to_list()
cluster_procedures_top_10["cluster_" + str(c)] = cluster_top_10
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
# calculate the recall at position 10 value (test patient's procedures compared to the top 10 procedures in the most similar cluster)
overall_perf_ = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_procedures_top_10["cluster_" + str(cluster)]
procedures = df_test.loc[i, 'new_procedures_grouped']
recommended_proc = intersection(procedures, top_10_cluster)
performance = len(recommended_proc)/len(procedures)
overall_perf_ += performance
overall_perf = overall_perf_/len(df_test.index)
return overall_perf
# score proportion
def proc_recommendation_score_prop(df_train, df_test, n_clusters):
# for each cluster, get the top 10 procedures with the proportion of admissions in the cluster receiving them
cluster_procedures_top_10_prop = {}
for c in range(0, n_clusters):
target=df_train[df_train['cluster'] == c]['new_procedures_grouped']
proc_list =[]
for x in target:
proc_list.extend(x)
number_proc_tot = len(proc_list)
df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
df_cluster_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
df_cluster_proc.columns = ['prevalence']
df_cluster_proc['proportion'] = df_cluster_proc['prevalence']/number_proc_tot
df_cluster_proc['%_hadm_receiving'] = df_cluster_proc['prevalence']/len(df_train[df_train['cluster'] == c].index)
df_cluster_proc['rank'] = df_cluster_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
cluster_top_10_prop = df_cluster_proc[df_cluster_proc['rank'] <= 10]
cluster_procedures_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
# calculate the proportion score (test patient's procedures compared to the top 10 procedures in the most similar cluster)
score = 0
for i in df_test.index:
cluster = df_test.loc[i, 'cluster']
top_10_cluster = cluster_procedures_top_10_prop["cluster_" + str(cluster)].index.to_list()
score_ = 0
number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
for drug in df_test.loc[i, 'new_procedures_grouped']:
if drug in top_10_cluster:
score_drug = cluster_procedures_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
else:
score_drug = 0
score_ += score_drug
# average score in order to compare patients with different number of procedures
score_i = score_/number_procedure
score += score_i
final_score = score/len(df_test.index)
return final_score
#--------------------------------------------------------------------------------------------------------------------
# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta',
'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta',
'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta',
'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta',
'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']
# (paste autoencoder parameters according to input features set selected)
n_neurons = 128
n_layers = 2
latent_dimension = 42
n_components = 0.8 # number of components selected to explain > 80% of the variance
# select optimal cluster number for each combination according to the cluster number optimisation results
n_cluster_PCA_kmeans = 6
n_cluster_AE_kmeans = 4
n_cluster_PCA_hierarchical = 3
n_cluster_AE_hierarchical = 4
# set random seed
np.random.seed(1301)
random.set_seed(2206)
# data preparation
X = df_train[variables]
X_test = df_test[variables]
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_test_unscaled = X_test.to_numpy()
X_test_scaled = StandardScaler().fit_transform(X_test_unscaled)
#--------------------------------------------------------------------------------------------------------------------
# PCA + k-means
# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
# define k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_PCA_kmeans, n_init=10,
random_state=2206, tol=0.0001, verbose=0)
# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()
# assess performance
perf_PCA_kmeans = proc_recommendation_perf(df_train, df_test, n_cluster_PCA_kmeans)
score_prop_PCA_kmeans = proc_recommendation_score_prop(df_train, df_test, n_cluster_PCA_kmeans)
print('++++++++++++++++++')
print('Performance PCA + k-means: ' + str(perf_PCA_kmeans))
print('Score prop PCA + k-means: ' + str(score_prop_PCA_kmeans))
#--------------------------------------------------------------------------------------------------------------------
# PCA + hierarchical clustering
# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_PCA_hierarchical, affinity = 'euclidean', linkage ='ward')
# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
ave_cluster_dist = []
for cluster in range(0, n_cluster_PCA_hierarchical):
average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
ave_cluster_dist.append(average_dist)
target_cluster = np.argmin(ave_cluster_dist)
df_test.loc[i,'cluster'] = target_cluster
# assess performance
perf_PCA_hierarchical = proc_recommendation_perf(df_train, df_test, n_cluster_PCA_hierarchical)
score_prop_PCA_hierarchical = proc_recommendation_score_prop(df_train, df_test, n_cluster_PCA_hierarchical)
print('++++++++++++++++++')
print('Performance PCA + hierarchical: ' + str(perf_PCA_hierarchical))
print('Score prop PCA + hierarchical: ' + str(score_prop_PCA_hierarchical))
#--------------------------------------------------------------------------------------------------------------------
# AE + k-means
# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=200, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)
# define k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_AE_kmeans, n_init=10,
random_state=2206, tol=0.0001, verbose=0)
# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()
# assess performance
perf_AE_kmeans = proc_recommendation_perf(df_train, df_test, n_cluster_AE_kmeans)
score_prop_AE_kmeans = proc_recommendation_score_prop(df_train, df_test, n_cluster_AE_kmeans)
print('++++++++++++++++++')
print('Performance AE + k-means: ' + str(perf_AE_kmeans))
print('Score prop AE + k-means: ' + str(score_prop_AE_kmeans))
#--------------------------------------------------------------------------------------------------------------------
# AE + hierarchical clustering
# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)
# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_AE_hierarchical, affinity = 'euclidean', linkage ='ward')
# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
ave_cluster_dist = []
for cluster in range(0, n_cluster_AE_hierarchical):
average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
ave_cluster_dist.append(average_dist)
target_cluster = np.argmin(ave_cluster_dist)
df_test.loc[i,'cluster'] = target_cluster
# assess performance
perf_AE_hierarchical = proc_recommendation_perf(df_train, df_test, n_cluster_AE_hierarchical)
score_prop_AE_hierarchical = proc_recommendation_score_prop(df_train, df_test, n_cluster_AE_hierarchical)
print('++++++++++++++++++')
print('Performance AE + hierarchical: ' + str(perf_AE_hierarchical))
print('Score prop AE + hierarchical: ' + str(score_prop_AE_hierarchical))
Requires df_clustering_ready and df_raw
dataset characteristics
# select postoperative contact points on the ward (including recovery time)
df_fig4 = df_graphs_all[df_graphs_all['status'].isin([2, 6])]
# plot figure 4
pdf = plt.figure(figsize=(12,17))
plt.suptitle(t = pop.description, fontsize = 20, y = 0.93)
Fig1 = plt.subplot(321)
df_fig4[df_fig4.alarm == 1]['score_total'].plot(kind='hist', bins = np.arange(2.5, 15.5, 1), color = 'darkorange')
plt.axvline(x=6.5, color='black', linestyle='-')
Fig1.set_title('Distribution of NEWS values', fontsize = 14, pad = 10)
Fig1.set_xlabel('NEWS values', fontsize = 12)
Fig1.set_ylabel('Frequency', fontsize = 12)
Fig2 = plt.subplot(322)
df_fig4[df_fig4.alarm == 1]['hadm_id'].value_counts().plot(kind='hist', bins=np.arange(0.5, 101.5, 1))
Fig2.set_title('Number of triggers per admission', fontsize = 14, pad = 10)
Fig2.set_xlabel('Number of triggers', fontsize = 12)
Fig2.set_ylabel('Frequency', fontsize = 12)
Fig3 = plt.subplot(323)
df_fig4['post_firstOP_time_h'] = df_fig4['post_firstOP_time'] / np.timedelta64(1, 'h')
df_fig4[df_fig4.alarm == 1]['post_firstOP_time_h'].plot(kind='hist', bins=100, range=(0,400), color = 'crimson')
Fig3.set_title('Distribution of NEWS triggers over time', fontsize = 14, pad = 10)
Fig3.set_xlabel('Post operative time (hours)', fontsize = 12)
Fig3.set_ylabel('Frequency', fontsize = 12)
Fig4 = plt.subplot(324)
df_fig4['post_firstOP_time_h'] = df_fig4['post_firstOP_time'] / np.timedelta64(1, 'h')
df_fig4[(df_fig4['alarm'] == 1)].drop_duplicates(subset=['hadm_id'], keep='first')['post_firstOP_time_h'].plot(kind='hist', bins=100, range=(0,200), color = 'darkcyan')
Fig4.set_title('Distribution of first alarm triggers over time', fontsize = 14, pad = 10)
Fig4.set_xlabel('Post operative time (hours)', fontsize = 12)
Fig4.set_ylabel('Frequency', fontsize = 12)
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=0.3)
plt.show()
pdf.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig4_' + pop.name + '.pdf', bbox_inches = 'tight')
description of the intervals between the timepoints of input data collection
# select observations and lab values during a postoperative stay on the ward (without recovery period)
df_fig5=df_raw[df_raw['status'] == 2].copy()
# select information to display: 'vitals' for only the SEND entries, 'labs' for only the laboratory entries, 'POCT' for only POCT entries, 'all' for all combined
record = 'vitals'
# select the time of the day to consider: 'all_day' for all day, 'period' for only a selected period of time. If period is selected, enter the start and end time
time = 'all_day'
start = '20:00'
end = '07:00'
# select data accordingly
if record == 'vitals':
df_rec = df_fig5.dropna(subset = ['charttime']).copy(deep=True)
title1 = 'vital signs'
elif record == 'labs':
df_rec = df_fig5.dropna(subset = ['labtime']).copy(deep=True)
title1 = 'labs'
elif record == 'POCT':
df_rec = df_fig5.dropna(subset = ['collection_time']).copy(deep=True)
title1 = 'POCT'
elif record == 'all':
df_rec = df_fig5.copy(deep=True)
title1 = 'vital signs, labs and POCT'
if time == 'period':
df_time = df_rec.copy(deep=True)
df_time = df_time.set_index(pd.DatetimeIndex(df_time['obstime']))
df_time = df_time.between_time(start, end)
title2 = '\n (without recording from ' + end + ' to ' + start + ')'
elif time == 'all_day':
df_time = df_rec.copy(deep=True)
title2 = ''
# time unit adjustments
df_time['interval_s']=df_time['interval']/datetime.timedelta(seconds=1)
df_time['interval_h']=df_time['interval']/datetime.timedelta(hours=1)
df_time['time_since_hadm_s']=df_time['time_since_hadm']/datetime.timedelta(seconds=1)
df_time['time_since_hadm_h']=df_time['time_since_hadm']/datetime.timedelta(hours=1)
# plot figure 5
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# interval between 2 recordings
df_time['interval_h'].plot(ax=ax1, kind='hist', bins = range(0,36), fontsize = 14) # use range(0,36) for vitals and range(0,72) for labs
ax1.set_title('Distribution of the intervals between 2 ' + title1 + '\nrecording over time', fontsize = 14, pad = 10)
ax1.set_xlabel('interval between two recording (h)', fontsize = 12)
ax1.set_ylabel('number of recording', fontsize = 12)
ax1.tick_params(labelsize=10)
# (for labs graph) ax1.axvline(x=26, color='black', linestyle='--')
# evolution of the interval between 2 recordings according to the patient length of stay (boxplot)
df_time['stay_group'] = pd.cut(df_time['time_since_hadm_h'], bins=range(0, 2501, 120))
df_time.boxplot(ax=ax2, column='interval_h', by='stay_group', grid=False, whis=[5, 95], showfliers=False)
ax2.set_xticklabels([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
ax2.set_title('Evolution of intervals between 2 ' + title1 + '\nrecording over time', fontsize = 14, pad = 10)
ax2.set_xlabel('length of stay (by group of 5 days)', fontsize = 12)
ax2.set_ylabel('interval between 2 recordings (h)', fontsize = 12)
plt.suptitle('')
# (for labs graph) ax2.axhline(y=26, color='black', linestyle='--')
# adapt file name according to data used
fig.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig2_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
distribution of the number of new prescription within 24 hours according to the occurence of NEWS alarms and critical events following a contact point
# select postoperative contact points on the ward (without recovery period)
df_fig6 = df_graphs_all[df_graphs_all['status']==2]
# produce a dataframe for each of the combinations of interest
s1 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0)]['number_prescriptions_groups'])
s2 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1)]['number_prescriptions_groups'])
s3 = pd.DataFrame(df_fig6[(df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s4 = pd.DataFrame(df_fig6[(df_fig6['alarm'] == 1)]['number_prescriptions_groups'])
s5 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0) & (df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s6 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0) & (df_fig6['alarm'] == 1)]['number_prescriptions_groups'])
s7 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1) & (df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s8 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1) & (df_fig6['alarm'] == 1)]['number_prescriptions_groups'])
s1['group'] = '1.No_event'
s2['group'] = '2.Event'
s3['group'] = '3.No_alarm'
s4['group'] = '4.Alarm'
s5['group'] = '5.No_alarm_no_event'
s6['group'] = '6.Alarm_no_event'
s7['group'] = '7.No_alarm_event'
s8['group'] = '8.Alarm_and_event'
# used to adapt legend
print(len(s1.index))
print(len(s2.index))
print(len(s3.index))
print(len(s4.index))
print(len(s5.index))
print(len(s6.index))
print(len(s7.index))
print(len(s8.index))
# plot figure 6
df_plot = pd.concat([s1, s2, s3, s4, s5, s6, s7, s8])
ax=df_plot.boxplot(by='group', grid=False, whis=[5, 95], showfliers=False)
ax.set_xticklabels(['No event\n(187,330)', 'Event\n(4052)', 'No alarm\n(170,370)', 'Alarm\n(21,012)', 'No alarm, no event\n(167,716)', 'Alarm, no event\n(19,614)', 'No alarm, event\n(2654)', 'Alarm and event\n(1398)'], rotation=70)
ax.set_title('Number of new prescriptions within 24h according\nto the alarm and event status', fontsize = 14, pad = 10)
ax.set_xlabel('Presence/abscence of alarm and event', fontsize = 12)
ax.set_ylabel('Number of new prescriptions\nwithin 24h', fontsize = 12)
plt.suptitle('')
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig6_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
cluster number optimisation for the k-means algorithm using the custom proportion score
# choose optimisation score and input features set of interest
df_res_PCA_kmeans = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_kmeans_first_all.csv', index_col=[0])
df_res_AE_kmeans = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_kmeans_first_all.csv', index_col=[0])
# paste according latent dimension and n_components variables
latent_dimension = [21, 42, 63, 84, 105]
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]
# plot figure 7
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14,6))
fig.subplots_adjust(hspace=0.35, wspace=0.25)
axs=axs.ravel()
axs[0].plot(df_res_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[0].set_title('PCA + kmeans:\nvalidation results', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[0].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'lower right')
axs[1].plot(df_res_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[1].set_title('Autoencoder + hierarchical clustering:\nvalidation results', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[1].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'lower right')
# adapt to optimisation score and input features set selected
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig7_ScoreProp_all_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()
Cluster number optimisation for the hierarchical clustering algorithm using the dendrogram method
# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta',
'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta',
'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta',
'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta',
'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']
X = df_train[variables]
# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
#PCA
pca = PCA(n_components=0.8, svd_solver='full')
X_PCA_latent = pca.fit_transform(X_scaled)
#AutoEncoder (paste parameters according to input features set selected)
dimension = len(variables)
batch = 32
n_neurons = 128
n_layers = 2
n_latdim = 42
input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers):
encoded = layers.Dense(n_neurons, activation='relu')(encoded)
encoded = layers.Dense(n_latdim, activation='relu')(encoded)
decoded = layers.Dense(n_neurons, activation='relu')(encoded)
for decoder_layer in range(1, n_layers):
decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)
autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_AE_latent = encoder.predict(X_scaled)
# define the dendrogram plotting function (adapted from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)
def plot_dendrogram(model, **kwargs):
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack(
[model.children_, model.distances_, counts]
).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
# define and fit the hierarchical clustering models with different dimensionality reduction methods
model_PCA = AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity = 'euclidean', linkage ='ward')
model_PCA = model_PCA.fit(X_PCA_latent)
model_AE = AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity = 'euclidean', linkage ='ward')
model_AE = model_AE.fit(X_AE_latent)
# plot figure 8
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
fig.subplots_adjust(wspace=0.25)
plot_dendrogram(model_PCA, ax=ax1, truncate_mode="level", p=4, leaf_font_size=13, leaf_rotation=90, color_threshold = 60)
ax1.set_title("PCA + Hierarchical Clustering Dendrogram", fontsize = 16, pad = 15)
ax1.set_xlabel("Number of points in mid-level clusters (nodes)\n(or index of single point cluster if no parenthesis)", fontsize = 12, labelpad = 15)
ax1.set_ylabel("Distance between clusters\n(ward method)", fontsize = 12)
ax1.tick_params(labelsize=12)
plot_dendrogram(model_AE, ax=ax2, truncate_mode="level", p=4, leaf_font_size=12, leaf_rotation=90, color_threshold = 80)
ax2.set_title("AE + Hierarchical Clustering Dendrogram", fontsize = 16, pad = 15)
ax2.set_xlabel("Number of points in mid-level clusters (nodes)\n(or index of single point cluster if no parenthesis)", fontsize = 12, labelpad = 15)
ax2.set_ylabel("Distance between clusters\n(ward method)", fontsize = 12)
ax2.tick_params(labelsize=12)
# adapt name to input features set selected
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig8_hierarchical_all_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()