Supplementary file IV.1

A patient similarity-based approach to improve the management of postoperative complications

Baptiste Vasey 24-07-2019 to 10-04-2023

Jupyter Notebook v6.0.0 Python v3.7.4 DBeaver v6.2.0 (SQL 8+)

Data source: Hospital Alerting Via Electronic Noticeboard (HAVEN) - Oxford University Hospitals NHS Trust

Dependency: vitals_labs_POCTS.sql (see suppl. file IV.2)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Prelims

In [ ]:
import pymysql
import mysql.connector
import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import datetime
import random
import math
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering 
from sklearn.neighbors import kneighbors_graph 
from sklearn.neighbors import NearestCentroid
from sklearn import metrics
from scipy import linalg
from scipy.spatial.distance import cdist
from scipy.sparse.linalg import expm
from scipy.cluster import hierarchy
from sklearn.cluster import AgglomerativeClustering
import pickle
import itertools
import clusteval
import statistics
import keras
from keras import layers
from tensorflow import random

Data extraction and pre-processing

Target population

In [ ]:
class Population:
    description = "population description"
    string = "SQL command"
    name = "name for file"

# all inpatient surgical admissions between 01.01.2016 and 31.12.2017, whose first theatre visit had a treatment function code in (100, 104, 105, 106, 301, 306) + general surgery in OR description
pop4 = Population()
pop4.description = "All visceral surgery patient"
pop4.string = """
SELECT DISTINCT first_tv.hadm_id
FROM (
    SELECT A.*
    FROM havenouh_v15.theatre_visits as A
    INNER JOIN	(
        SELECT hadm_id, min(anaesthesia_time) as anaesthesia_time
        FROM havenouh_v15.theatre_visits
        GROUP BY hadm_id) as B 
    ON A.hadm_id = B.hadm_id
    AND A.anaesthesia_time = B.anaesthesia_time
    WHERE A.main_specialty = "General Surgery"
    ORDER BY A.hadm_id
    ) as first_tv
LEFT JOIN havenouh_v15.admissions as a
ON first_tv.hadm_id = a.hadm_id
LEFT JOIN havenouh_v15.consultants as c
ON first_tv.episode_id = c.episode_id
WHERE ((c.treatment_function_code in (100, 104, 105, 106, 301, 306)) OR (c.treatment_function_code = 180 AND c.specialty_code = 100))
AND (first_tv.main_specialty = 'General Surgery')
AND (to_days(a.admission_time)<>to_days(a.discharge_time))
AND (year(a.admission_time) BETWEEN 2016 and 2017)
ORDER BY first_tv.hadm_id
"""
pop4.name = "visceral_all"

print('populations ready')

Define data extraction function

In [ ]:
def extraction(population):

    # connect to database
    mydb = mysql.connector.connect(
      host="mysql8",
      user= os.getenv('USERNAMEH'),
      passwd= os.getenv('PASSWORDH'),
      database="havenouh_v15"
    )

    # import a database of vital signs, labs and POCT for all surgical admission with at least one visit to theatre, between 2016 and 2017
    query1 = """
    SELECT vlp.*, 
        co.subject_id, co.gender, co.age, co.hospital_code, co.treatment_function_code, co.admission_time, co.discharge_time, co.cci, co.weight, co.BMI,
        f.OP_start_time AS firstOP_start_time, f.OP_end_time AS firstOP_end_time
    FROM user_baptiste_vasey.vitals_labs_POCTs vlp
    LEFT JOIN user_baptiste_vasey.cohort co
    ON vlp.hadm_id = co.hadm_id
    LEFT JOIN user_baptiste_vasey.first_OP f
    ON vlp.hadm_id = f.hadm_id
    WHERE vlp.hadm_id IN (""" + population.string + """) 
    ORDER BY vlp.hadm_id, vlp.obstime;"""
    df_OP_inpat = pd.read_sql(query1, mydb)

    # import a database of theatre visits for the selected hospital admissions
    query2 = """
    SELECT hadm_id, start_time, end_time, theatre, main_specialty
    FROM havenouh_v15.theatre_visits
    WHERE hadm_id IN (""" + population.string + """)
    ORDER BY hadm_id;"""
    df_list_OP = pd.read_sql(query2, mydb)
    
    # set OP end time with the value of OP start time in the cases where the value is missing 
    df_list_OP.loc[df_list_OP['end_time'].isnull(),'end_time'] = df_list_OP[df_list_OP['end_time'].isnull()]['start_time']
    
    mydb.close()
    
    
    # create a column time_since_hadm giving the time delta since admission
    df_OP_inpat['time_since_hadm'] = df_OP_inpat['obstime'] - df_OP_inpat['admission_time']

    # first operation end time is claculated in SQl server: see df_OP_inpat['firstOP_end_time']
    # set first OP end time with the value of first OP start time in the cases where the value is missing 
    df_OP_inpat.loc[df_OP_inpat['firstOP_end_time'].isnull(),'firstOP_end_time'] = df_OP_inpat[df_OP_inpat['firstOP_end_time'].isnull()]['firstOP_start_time']

    # get last operation end time
    df_OP_inpat['lastOP_end_time'] = np.datetime64('NaT')

    def get_lastOP_end_time(row):
        hadm_id = row['hadm_id']
        obs_time = row['obstime']
        
        if obs_time >= row['firstOP_end_time']:
            last_op = df_list_OP[(df_list_OP['hadm_id'] == hadm_id) & (df_list_OP['end_time'] < obs_time)]['end_time'].max()
            row.loc['lastOP_end_time'] = last_op
        
        # so observations taken before the first operation also have a value for 'lastOP_end_time'
        else:
            row.loc['lastOP_end_time'] = row['firstOP_end_time']

        return row

    df_OP_inpat=df_OP_inpat.apply(lambda x: get_lastOP_end_time(x), axis=1)
    
    
    # create a column post_firstOP_time giving the time delta since the first operation
    df_OP_inpat.loc[:,'post_firstOP_time'] = df_OP_inpat['obstime'] - df_OP_inpat['firstOP_end_time']
    
    # transform the post first OP time into either deciles (to correct for markedly skewed data) or number of hours
    df_OP_inpat['post_firstOP_time_deciles'] = pd.qcut(df_OP_inpat['post_firstOP_time'], 10, labels = False, duplicates = 'drop') +1
    df_OP_inpat['post_firstOP_time_hours'] = df_OP_inpat['post_firstOP_time']/np.timedelta64(1, 'h')
    
    # create a column post_lastOP_time giving the time delta since the last operation
    df_OP_inpat.loc[:,'post_lastOP_time'] = df_OP_inpat['obstime'] - df_OP_inpat['lastOP_end_time']
    
    # transform the post last OP time into either deciles (to correct for markedly skewed data) or number of hours
    df_OP_inpat['post_lastOP_time_deciles'] = pd.qcut(df_OP_inpat['post_lastOP_time'], 10, labels = False, duplicates = 'drop') +1
    df_OP_inpat['post_lastOP_time_hours'] = df_OP_inpat['post_lastOP_time']/np.timedelta64(1, 'h')
    
    # create a column interval giving the time interval since the last recording of the same type (but not necessarily the same variable)
    df_OP_inpat.loc[df_OP_inpat['charttime'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['charttime'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']
    df_OP_inpat.loc[df_OP_inpat['labtime'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['labtime'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']
    df_OP_inpat.loc[df_OP_inpat['collection_time'].notnull(),'interval'] = df_OP_inpat[df_OP_inpat['collection_time'].notnull()][['hadm_id','time_since_hadm']].groupby(['hadm_id']).diff().loc[:,'time_since_hadm']

    # convert sex F/M into 0/1
    df_OP_inpat.loc[df_OP_inpat['gender'] == 'F', 'gender'] = 0
    df_OP_inpat.loc[df_OP_inpat['gender'] == 'M', 'gender'] = 1

    
    #order the colmns
    df_OP_inpat=df_OP_inpat[['row_num', 'hadm_id', 'obstime', 'charttime', 'labtime', 'collection_time', 'location_id',
                             'HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU_RAW', 'GCS', 'AVPU',
                             'MASKTYPE_ID', 'FIO2_LMIN', 'FIO2_PERC', 'O2TH', 'ESTIMATED_FIO2', 'ESTIMATED_FIO2_2', 'BATEMAN_FIO2', 
                             'ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 
                             'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 
                             'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'URUR', 'WBC',
                             'subject_id', 'gender', 'age', 'weight', 'BMI', 'cci', 'hospital_code', 'treatment_function_code', 
                             'admission_time', 'discharge_time',  'firstOP_start_time', 'firstOP_end_time', 'lastOP_end_time', 
                             'time_since_hadm', 'post_firstOP_time', 'post_firstOP_time_hours', 'post_lastOP_time', 'post_lastOP_time_hours', 'interval']]

    #SQL data extraction patch
    df_OP_inpat['BASEX'].replace('.....', np.nan, inplace=True)
    
    for col in ['FIO2_PERC', 'ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
               'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
               'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'URUR', 'WBC']:
        df_OP_inpat.loc[:,col]=df_OP_inpat[col].astype(float)
      
    return df_OP_inpat

print('extraction function ready')

Define location status function

In [ ]:
# 1 = pre first OP - on the ward
# 2 = post first OP - on the ward after recovery time
# 3 = in ICU (CICU: 17, CCU: 39, Adult ICU: 87, CTV ICU: 95, Neuro ICU: 1209 
# 4 = in operative suite (Women's Day surgery and Diagnostic Suite: 35, Minor Ops Treatment room: 1228, Cardiac Angiography Suite: 58, Delivery Suite: 40 & 96, Cardiac Cath Lab E: 63,
#                         Cardiac Cath Lab A: 59, Cardiac Cath Lab B: 60, Cardiac Cath Lab D: 62, AVIC Cath Lab: 55, West Wing Main Theatre Suite Recovery: 148 & 25, Cardiac Angiography Suite Recovery: 336, 
# 5 = in nursing home (Longlands: 1646, Green Pastures: 1644, Banbury Heights: 1641, Heathfield House: 1645, Triangle: 1648 & 1649, Albany: 1640, Freeland: 1666, The Close: 1657, 
#                      Southerndown: 1667, The Ramping Cat: 1647, Acute Hospital at Home: 1710, Wyndham Hall: 1714 & 1717, St Lukes: 1656)
# 6 = on ward postoperative recovery (time window can be adapted through the recovery variable)
# 7 = errors

# Define recovery period
recovery = 1

def get_status(row, df_ICU, df_OP):
    if row['location_id'] in [17, 39, 87, 95, 1209] or row['row_num'] in df_ICU['row_num'].values:
        return 3
    
    elif row['location_id'] in [25, 35, 40, 55, 58, 59, 60, 62, 63, 96, 148, 336, 1228] or row['row_num'] in df_OP['row_num'].values:
        return 4
    
    elif row['location_id'] in [1640, 1641, 1644, 1645, 1646, 1647, 1648, 1649, 1656, 1657, 1666, 1667, 1710, 1714, 1717]:
        return 5
    
    # adapt recovery period in hours
    elif row['obstime'] > row['firstOP_end_time'] and row['obstime'] <= (row['firstOP_end_time'] + datetime.timedelta(hours=recovery)):
        return 6
    
    elif row['obstime'] >= (row['admission_time'] - datetime.timedelta(hours=24)) and row['obstime'] < row['firstOP_start_time']:
        return 1
    
    # post recovery period in hours (must match hours under number 6)
    elif row['obstime'] > (row['firstOP_end_time'] + datetime.timedelta(hours=recovery)) and row['obstime'] <= (row['discharge_time'] + datetime.timedelta(hours=24)):
        return 2
          
    else:
        return 7

print('status function ready')

Define fetch past lab and POCT data function

In [ ]:
# fetch lab and POCT value in the past within a <fetch_hours> hours time window; consider the last value of intraoperative, recovery and postop ward.    
def fetch_labs(df_fetch, df_raw, fetch_hours):
    
    def find_obs(x, interval, df):
        time = x['obstime']
        i = x['hadm_id']
        df_sub=df[(df.hadm_id == i) & (df['obstime'] <= time) & ((time - df['obstime']) <= datetime.timedelta(hours=interval)) & (df['status'].isin([2, 4, 6]))]
        
        def across_features(x, i, time):
            last_index = x.last_valid_index()
            if last_index != None:
                last = df.loc[last_index,x.name]
            else:
                last = np.nan
            
            return last
                    
        x[['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
           'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
           'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']] = df_sub[['ALB', 'ALBUR', 
           'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA',
           'DDIM', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB',
           'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']].apply(across_features, args=(i,time))
        
        return x
    
    df_fetch=df_fetch.apply(find_obs, args=(fetch_hours,df_raw), axis=1)
        
    return df_fetch  

print('fetch past lab function ready')

Define vital signs imputation function

In [ ]:
# impute missing value for vital signs
def vitals_complete (df_complete):
    
    for var in ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']:
        df_complete[var]=df_complete.groupby('hadm_id')[var].apply(lambda x: x.fillna(x.interpolate()))
        # to fix the abscence of interpolation if first value missing
        df_complete[var]=df_complete.groupby('hadm_id')[var].apply(lambda x: x.fillna(x.bfill()))
        # to impute any remaining missing values
        df_complete[var]=df_complete[var].fillna(df_complete[var].median())
    
    return df_complete

print('vital signs imputation function ready')

Define get NEWS score function

In [ ]:
def get_NEWS(dfs):

    def score_RR(value):
        if 21<= value <25:
            score = 2
        elif value >=25:
            score = 3
        elif 9<= value <=11:
            score = 1
        elif value <9:
            score = 3
        else:
            score = 0
        return score

    def score_SPO2(value):
        if 94<= value <=95:
            score = 1
        elif 92<= value <=93:
            score = 2
        elif value <=91:
            score = 3
        else:
            score = 0
        return score

    def score_FIO2_LMIN(value):
        if value >0:
            score = 2
        else:
            score = 0
        return score

    def score_SBP(value):
        if value >=220:
            score = 3
        elif 101<= value <=110:
            score = 1
        elif 91<= value <=100:
            score = 2
        elif value <=90:
            score = 3
        else:
            score = 0
        return score

    def score_HR(value):
        if value >=131:
            score = 3
        elif 111<= value <=130:
            score = 2
        elif 91<= value <=110:
            score = 1
        elif 41<= value <=50:
            score = 1
        elif value <=40:
            score = 3
        else:
            score = 0
        return score

    def score_AVPU(value):
        if value >1:
            score = 3
        else:
            score = 0
        return score

    def score_TEMP(value):
        if value >=39.1:
            score = 2
        elif 38.1<= value <=39:
            score = 1
        elif 35.1<= value <=36:
            score = 1
        elif value <=35:
            score = 3
        else:
            score = 0
        return score
      
    def alarm(df):
        
        if df['score_total'] >= 5:
            score = 1
        elif df['score_RR'] == 3 or df['score_SPO2'] == 3 or df['score_SBP'] == 3 or df['score_HR'] == 3 or df['score_AVPU'] == 3 or df['score_TEMP'] == 3:                
            score = 1
        else:
             score = 0
        return score
            
    dfs['score_RR'] = dfs['RR'].apply(lambda x: score_RR(x))
    dfs['score_SPO2'] = dfs['SPO2'].apply(lambda x: score_SPO2(x))
    dfs['score_FIO2_LMIN'] = dfs['FIO2_LMIN'].apply(lambda x: score_FIO2_LMIN(x))
    dfs['score_SBP'] = dfs['SBP'].apply(lambda x: score_SBP(x))
    dfs['score_HR'] = dfs['HR'].apply(lambda x: score_HR(x))
    dfs['score_AVPU'] = dfs['AVPU'].apply(lambda x: score_AVPU(x))
    dfs['score_TEMP'] = dfs['TEMP'].apply(lambda x: score_TEMP(x))
    
    dfs['score_total'] = dfs.apply(lambda row: row.score_RR + row.score_SPO2 + row.score_FIO2_LMIN + row.score_SBP + row.score_HR + row.score_AVPU + row.score_TEMP, axis = 1)
    dfs['alarm'] = dfs.apply(lambda row: alarm(row), axis = 1)
    
    return dfs

print('get NEWS function ready')

Define missingness columns function

In [ ]:
def track_missingness(df):
    
    selected_lab_var = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU', 
                        'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 
                        'TBIL', 'TROP', 'UR', 'WBC']
    
    for var in selected_lab_var:
        var0 = var + '_miss'
        # create new columns tracking initial missigness
        df[var0]=np.nan
        # 1 if present, 0 if missing
        df[var0]=df.apply(lambda x: 0 if np.isnan(x[var]) else 1, axis = 1)
        
    return df

print('missingness function ready')

Define lab and POCT imputation function

In [ ]:
def impute_labs(df_complete, df_raw):

    # select imputation method (random_normal, pop_median, perso_median)
    impute_method = 'perso_median'

    # select input lab variables 
    selected_lab_var = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU', 
                        'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 
                        'TBIL', 'TROP', 'UR', 'WBC']
    
    # only consider pre and post op ward values
    df_raw = df_raw[df_raw['status'].isin([1, 2, 6])]

    # from https://www.ouh.nhs.uk/biochemistry/tests/documents/biochemistry-reference-ranges.pdf 
    # and https://www.ouh.nhs.uk/services/departments/laboratory-medicine/haematology-laboratories/documents/haematology-handbook.pdf
    # and https://www.nhs.uk/conditions/kidney-disease/diagnosis/
    # and https://oxfordmedicine.com/view/10.1093/med/9780198703860.001.0001/med-9780198703860-appendix-1
    # and https://www.ouh.nhs.uk/biochemistry/tests/tests-catalogue/methaemoglobin.aspx
    # and https://www.nhs.uk/conditions/diabetic-ketoacidosis/

    # normal distribution assuming a mean of lower range + (upper range-lower range)/2 and a SD of (mean-lower range)/2 => 2SD correpsonding to a 95% confidence interval, which is also the most common definition of the normal range
    lab_references = {
        'ALB' : [41.5, 4.75], #np.arange(32, 51, 1, dtype='float'), # g/L
        'ALBUR' : [1.8, 0.9], #np.arange(0, 3.6, 0.1, dtype='float'), #  mg/mmol (0 to 2.5 mg/mmol for male) 
        'ALP' : [80.5, 25.25], #np.arange(30, 131, 1, dtype='float'), #  IU/l (age dependent) 
        'ALT' : [28, 9], #np.arange(10, 46, 1, dtype='float'), # IU/l  
        'APTT' : [25.5, 2.75], #np.arange(20, 31, 1, dtype='float'), # sec  
        'AST' : [29, 7], #np.arange(15, 43, 1, dtype='float'), # IU/L 
        'BAS' : [0.065, 0.0225], #np.arange(0.02, 0.11, 0.01, dtype='float'), # 10^9/l 
        'BASEX' : [0.5, 1.525], #np.arange(-3.0, 3.1, 0.1, dtype='float'), # mmol/L
        'BICAR' : [27, 2.5], #np.arange(22, 32, 1, dtype='float'), # mmol/L
        'CAL' : [1.205, 0.0525], #np.arange(1.1, 1.31, 0.01, dtype='float'), # mmol/l
        'CL' : [100.5, 2.75], #np.arange(95, 106, 1, dtype='float'), # mmol/L
        'CR' : [69.5, 10.25], #np.arange(49, 90, 1, dtype='float'), #  µmol/L
        'CRP' : [3, 1.5], #np.arange(0, 6, 1, dtype='float'), # mg/l
        'COCA' : [2.405, 0.1025], #np.arange(2.2, 2.61, 0.01, dtype='float'), # mmol/l
        'DDIM' : [250.5, 125.25], #np.arange(0, 501, 10, dtype='float'), # µg/l FEU
        'EGFR' : [105.5, 7.75], #np.arange(90, 121, 1, dtype='float'), # ml/min (upper bound arbitrarily set for computational reasons) 
        'EOS' : [0.265, 0.1225], #np.arange(0.02, 0.51, 0.01, dtype='float'), # 10^9/l
        'GLU' : [6.55, 1.275], #np.arange(4, 9.1, 0.1, dtype='float'), # caveat, this assume healthy patients or patients with controlled diabetes, with blood taken either before eating or 2h postprandial
        'HCT' : [0.435, 0.0375], #np.arange(0.36, 0.51, 0.01, dtype='float'), # range male: 0.40-0.50, female:0.36-0.46
        'HGB' : [14.6, 1.28], #np.arange(120, 171, 1, dtype='float'), # g/dl range male: 13.0-17.0, female: 12.0-15.0
        'INR' : [1, 0.2], #np.arange(0.6, 1.4, 0.1, dtype='float'), # no unit (ratio)
        'KET' : [0.3, 0.15], #np.arange(0, 0.6, 0.1, dtype='float'), # mmol/l
        'LAC' : [1.3, 0.4], #np.arange(0.5, 2.1, 0.1, dtype='float'), # mmol/L 
        'LYM' : [2.55, 0.775], #np.arange(1, 4.1, 0.1, dtype='float'), # 10^9/l 
        'MCH' : [29.55, 1.275], #np.arange(27, 32.1, 0.1, dtype='float'), # pg 
        'MCHC' : [330.5, 7.75], #np.arange(315, 346, 1, dtype='float'), # g/l 
        'MCV' : [92, 4.5], #np.arange(83, 101, 1, dtype='float'), # fl 
        'METHB' : [0.8, 0.4], #np.arange(0, 1.6, 0.1, dtype='float'), # % 
        'MON' : [0.605, 0.2025], #np.arange(0.2, 1.01, 0.01, dtype='float'), # 10^9/l 
        'MPV' : [10.55, 0.775], #np.arange(9, 12.1, 0.1, dtype='float'), # fl 
        'NEU' : [4.55, 1.275], #np.arange(2, 7.1, 0.1, dtype='float'), # 10^9/l 
        'OSM' : [287, 4.5], #np.arange(278, 296, 1, dtype='float'), # mosmol/kg 
        'PH' : [7.405, 0.0175], #np.arange(7.37, 7.44, 0.01, dtype='float'), # no unit 
        'PLT' : [275.5, 62.75], #np.arange(150, 401, 1, dtype='float'), # 10^9/l 
        'POT' : [4.3, 0.4], #np.arange(3.5, 5.1, 0.1, dtype='float'), # mmol/L 
        'RBC' : [4.7, 0.45], #np.arange(3.8, 5.6, 0.1, dtype='float'), # 10^12/l (male range:4.5-5.5, female range:3.8-4.8)
        'SOD' : [140.5, 2.75], #np.arange(135, 146, 1, dtype='float'), # mmol/l  
        'TBIL' : [11, 5.5], #np.arange(0, 22, 1, dtype='float'), # µmol/L 
        'TROP' : [17.5, 8.75], #np.arange(0, 35, 1, dtype='float'), #  ng/L (male:0-34 ng/L, female: 0-17 ng/L) 
        'UR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific) 
        'URUR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific) 
        'WBC' : [7.55, 1.775], #np.arange(4, 11.1, 0.1, dtype='float') # 10^9/l       
    }

    def personal_median(row, var, df_ref):
        time = row['obstime']
        hadm = row['hadm_id']
        median = df_ref[(df_ref.hadm_id == hadm) & (df_ref['obstime'] <= time)][var].median()
        return median

    if impute_method == 'random_normal':
        np.random.seed(2206)
        for var in selected_lab_var:
            # impute lab value based on a random draw from the physiological range
            df_complete[var] = df_complete[var].apply(lambda x: round(np.random.normal(loc=lab_references[var][0], scale=lab_references[var][1]),2) if np.isnan(x) else x)
            # set negative values to 0
            df_complete[var] = df_complete[var].apply(lambda x: x if x>0 else 0)

    elif impute_method == 'pop_median':
        for var in selected_lab_var:
            # impute lab value based on the population median
            df_complete[var]=df_complete[var].fillna(df_complete[var].median())

    elif impute_method == 'perso_median':
        selected_lab_var.remove('BASEX') # allow distribution of BASEX in negative value
        for var in selected_lab_var:
            # impute lab value based on the admission non-missing anterior values median if any available including pre op
            df_complete[var]=df_complete.apply(lambda x: personal_median(x, var, df_raw) if np.isnan(x[var]) else x[var], axis = 1)
            # impute remaining missing values with random draw from the physiological range
            df_complete[var] = df_complete[var].apply(lambda x: round(np.random.normal(loc=lab_references[var][0], scale=lab_references[var][1]),2) if np.isnan(x) else x)
            # set negative values to 0
            df_complete[var] = df_complete[var].apply(lambda x: x if x>0 else 0)
        
        # tailored imputation for 'BASEX' (allow distribution of BASEX in negative value)
        # impute lab value based on the admission non-missing anterior values median if any available
        df_complete['BASEX']=df_complete.apply(lambda x: personal_median(x, 'BASEX', df_raw) if np.isnan(x['BASEX']) else x['BASEX'], axis = 1)
        # impute remaining missing values with random draw from the physiological range
        df_complete['BASEX'] = df_complete['BASEX'].apply(lambda x: round(np.random.normal(loc=lab_references['BASEX'][0], scale=lab_references['BASEX'][1]),2) if np.isnan(x) else x)
        
        # set max EGFR value to 90
        df_complete['EGFR'] = df_complete['EGFR'].apply(lambda x: x if x<=90 else 90)

    else:
        print("please define correct imputation method")

    return df_complete
        
print('labs and POCT imputation function ready')

Define min-max function (only for vitals)

In [ ]:
def min_max(df, df_support):
    
    selected_var = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']
    df_support = df_support[df_support['status'].isin([2, 6])]
    
    def amplitude_24(row, var, df_support):
        time = row['obstime']
        hadm = row['hadm_id']
        # fetch 24h window in df_raw, only post recovery ward data
        df_prov = df_support[(df_support['hadm_id'] == hadm) & (df_support['obstime'] > (time - datetime.timedelta(hours=24))) & (df_support['obstime'] <= time) & (df_support['status'] == 2)]
        # to diferentiate true 0 difference from time window with only 1 datapoint
        if np.sum(df_prov[var].count())<=1:
            minmax = np.nan
        else:
            minmax = df_prov[var].max() - df_prov[var].min()
        
        return minmax
    
    def impute_amplitude_24(row, var, df_support):
        time = row['obstime']
        hadm = row['hadm_id']
        # get hadm_id median in df_raw, only ward data
        df_prov = df_support[(df_support['hadm_id'] == hadm) & (df_support['obstime'] < time)]
        # if median is not nan, set minmax as absolute value of difference between value and median of previous observation
        if np.isnan(df_prov[var].median()) == False:
            minmax = abs(row[var] - df_prov[var].median())
        else:
            minmax = 0
        
        return minmax
    
    for var in selected_var:
        var0 = var + '_minmax'
        # create new columns looking at maximal amplitude in the last 24 hours
        df[var0]=np.nan
        # min-max amplitude in the last 24h
        df[var0]=df.apply(lambda x: amplitude_24(x, var, df_support), axis = 1)
        # impute difference from personal median if minmax = nan
        df[var0]=df.apply(lambda x: impute_amplitude_24(x, var, df_support) if np.isnan(x[var0]) else x[var0], axis = 1)
        
    return df

print('min-max function ready')

Define Delta function for vitals

In [ ]:
def delta_vitals(df, df_support):
    
    selected_var_vitals = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN', 'ESTIMATED_FIO2_2']
    df_support_vital1 = df_support[(df_support['charttime'].notnull()) & (df_support['status']==2)]
    df_support_vital2 = df_support[(df_support['charttime'].notnull()) & (df_support['status'].isin([1, 2]))]
    
    # delta column for vital signs values
    for var in selected_var_vitals:
        var0 = var + '_delta'
        # create new columns looking at changes in 12 hours median value since 6 hours ago.
        df[var0]=np.nan
    
    def delta_24(row, var, df_support_vital1, df_support_vital2):
        time = row['obstime']
        hadm = row['hadm_id']
        # fetch difference between 12 hours median and 12h median 6 hours before (only ward post recovery)
        df_prov1 = df_support_vital1[(df_support_vital1['hadm_id'] == hadm) & (df_support_vital1['obstime'] > (time - datetime.timedelta(hours=12))) & (df_support_vital1['obstime'] <= time)]
        df_prov1_delta6 = df_support_vital1[(df_support_vital1['hadm_id'] == hadm) & (df_support_vital1['obstime'] > (time - datetime.timedelta(hours=18))) & (df_support_vital1['obstime'] <= (time - datetime.timedelta(hours=6)))]
        df_median = df_support_vital2[(df_support_vital2['hadm_id'] == hadm) & (df_support_vital2['obstime'] < time)]
        
        for var in selected_var_vitals:
            column = var + '_delta'
            # fetch hadm_id past median value including pre-op time
            median0 = df_prov1[var].median()
            median6 = df_prov1_delta6[var].median()
            median_hadm = df_median[var].median()
        
            if (np.isnan(median0) == False) & (np.isnan(median6) == False):
                row[column] = round((median0 - median6), 2)
            elif (np.isnan(median_hadm) == False) & (np.isnan(median0) == False):
                # if row is the first observation within 18 hours, compare it to the hospital admission's anterior median (including preOP values)
                row[column] = round(median0 - median_hadm, 2)   
            else:
                # if none available, consider delta as 0  
                row[column] = 0
        
        return row
    
    df=df.apply(lambda x: delta_24(x, selected_var_vitals, df_support_vital1, df_support_vital2), axis = 1)
   
    return df

print('delta vitals function ready')

Define Delta function for labs + POCT

In [ ]:
def delta_labs(df, df_support):
    
    # normal distribution assuming a mean of lower range + (upper range-lower range)/2 and a SD of (mean-lower range)/2 => 2SD correpsonding to a 95% confidence interval, which is also the most common definition of the normal range
    lab_references = {
        'ALB' : [41.5, 4.75], #np.arange(32, 51, 1, dtype='float'), # g/L
        'ALBUR' : [1.8, 0.9], #np.arange(0, 3.6, 0.1, dtype='float'), #  mg/mmol (0 to 2.5 mg/mmol for male) 
        'ALP' : [80.5, 25.25], #np.arange(30, 131, 1, dtype='float'), #  IU/l (age dependent) 
        'ALT' : [28, 9], #np.arange(10, 46, 1, dtype='float'), # IU/l  
        'APTT' : [25.5, 2.75], #np.arange(20, 31, 1, dtype='float'), # sec  
        'AST' : [29, 7], #np.arange(15, 43, 1, dtype='float'), # IU/L 
        'BAS' : [0.065, 0.0225], #np.arange(0.02, 0.11, 0.01, dtype='float'), # 10^9/l 
        'BASEX' : [0.5, 1.525], #np.arange(-3.0, 3.1, 0.1, dtype='float'), # mmol/L
        'BICAR' : [27, 2.5], #np.arange(22, 32, 1, dtype='float'), # mmol/L
        'CAL' : [1.205, 0.0525], #np.arange(1.1, 1.31, 0.01, dtype='float'), # mmol/l
        'CL' : [100.5, 2.75], #np.arange(95, 106, 1, dtype='float'), # mmol/L
        'CR' : [69.5, 10.25], #np.arange(49, 90, 1, dtype='float'), #  µmol/L
        'CRP' : [3, 1.5], #np.arange(0, 6, 1, dtype='float'), # mg/l
        'COCA' : [2.405, 0.1025], #np.arange(2.2, 2.61, 0.01, dtype='float'), # mmol/l
        'DDIM' : [250.5, 125.25], #np.arange(0, 501, 10, dtype='float'), # µg/l FEU
        'EGFR' : [105.5, 7.75], #np.arange(90, 121, 1, dtype='float'), # ml/min (upper bound arbitrarily set for computational reasons) 
        'EOS' : [0.265, 0.1225], #np.arange(0.02, 0.51, 0.01, dtype='float'), # 10^9/l
        'GLU' : [6.55, 1.275], #np.arange(4, 9.1, 0.1, dtype='float'), # caveat, this assume healthy patients or patients with controlled diabetes, with blood taken either before eating or 2h postprandial
        'HCT' : [0.435, 0.0375], #np.arange(0.36, 0.51, 0.01, dtype='float'), # range male: 0.40-0.50, female:0.36-0.46
        'HGB' : [14.6, 1.28], #np.arange(120, 171, 1, dtype='float'), # g/dl range male: 13.0-17.0, female: 12.0-15.0
        'INR' : [1, 0.2], #np.arange(0.6, 1.4, 0.1, dtype='float'), # no unit (ratio)
        'KET' : [0.3, 0.15], #np.arange(0, 0.6, 0.1, dtype='float'), # mmol/l
        'LAC' : [1.3, 0.4], #np.arange(0.5, 2.1, 0.1, dtype='float'), # mmol/L 
        'LYM' : [2.55, 0.775], #np.arange(1, 4.1, 0.1, dtype='float'), # 10^9/l 
        'MCH' : [29.55, 1.275], #np.arange(27, 32.1, 0.1, dtype='float'), # pg 
        'MCHC' : [330.5, 7.75], #np.arange(315, 346, 1, dtype='float'), # g/l 
        'MCV' : [92, 4.5], #np.arange(83, 101, 1, dtype='float'), # fl 
        'METHB' : [0.8, 0.4], #np.arange(0, 1.6, 0.1, dtype='float'), # % 
        'MON' : [0.605, 0.2025], #np.arange(0.2, 1.01, 0.01, dtype='float'), # 10^9/l 
        'MPV' : [10.55, 0.775], #np.arange(9, 12.1, 0.1, dtype='float'), # fl 
        'NEU' : [4.55, 1.275], #np.arange(2, 7.1, 0.1, dtype='float'), # 10^9/l 
        'OSM' : [287, 4.5], #np.arange(278, 296, 1, dtype='float'), # mosmol/kg 
        'PH' : [7.405, 0.0175], #np.arange(7.37, 7.44, 0.01, dtype='float'), # no unit 
        'PLT' : [275.5, 62.75], #np.arange(150, 401, 1, dtype='float'), # 10^9/l 
        'POT' : [4.3, 0.4], #np.arange(3.5, 5.1, 0.1, dtype='float'), # mmol/L 
        'RBC' : [4.7, 0.45], #np.arange(3.8, 5.6, 0.1, dtype='float'), # 10^12/l (male range:4.5-5.5, female range:3.8-4.8)
        'SOD' : [140.5, 2.75], #np.arange(135, 146, 1, dtype='float'), # mmol/l  
        'TBIL' : [11, 5.5], #np.arange(0, 22, 1, dtype='float'), # µmol/L 
        'TROP' : [17.5, 8.75], #np.arange(0, 35, 1, dtype='float'), #  ng/L (male:0-34 ng/L, female: 0-17 ng/L) 
        'UR' : [5.9, 1.7], #np.arange(2.5, 9.3, 0.1, dtype='float'), # mmol/l (age and sex specific) 
        'WBC' : [7.55, 1.775], #np.arange(4, 11.1, 0.1, dtype='float') # 10^9/l       
    }
    
    selected_var_labs = ['ALB', 'ALBUR', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'DDIM', 'EGFR', 'EOS', 'GLU', 
                        'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 
                        'TBIL', 'TROP', 'UR', 'WBC']
    
    df_support['labPOCT_time'] = df_support['labtime'].combine_first(df_support['collection_time'])
    df_support_lab1 = df_support[(df_support['labPOCT_time'].notnull()) & (df_support['status'].isin([2, 4, 6]))]
    df_support_lab2 = df_support[(df_support['labPOCT_time'].notnull()) & (df_support['status'].isin([1, 2]))]
    
    # delta column for labs and POCT values
    for var in selected_var_labs:
        var0 = var + '_delta'
        # create new columns looking at changes in 12 hours median value since 6 hours ago.
        df[var0]=np.nan
    
    def delta_50(row, var, df_support_lab1, df_support_lab2, lab_references):
        time = row['obstime']
        hadm = row['hadm_id']
        # fetch difference between value and last recorded value within 50h
        df_prov1 = df_support_lab1[(df_support_lab1['hadm_id'] == hadm) & (df_support_lab1['obstime'] > (time - datetime.timedelta(hours=50))) & (df_support_lab1['obstime'] <= time)]
        df_prov2 = df_support_lab2[(df_support_lab2['hadm_id'] == hadm) & (df_support_lab2['obstime'] < time)]
  
        for var in selected_var_labs:
            column = var + '_delta'
            # fetch hadm_id past median value including pre-op time
            median = df_prov2[var].median()
        
            if (np.sum(df_prov1[var].count())>=2):          
                last_lab_time1 = time - df_prov1[df_prov1[var].notnull()].iloc[-1]['obstime']
                last_lab_time2 = time - df_prov1[df_prov1[var].notnull()].iloc[-2]['obstime']
                
                # makes sure the last value in the table is less than 26 hours old (otherwise the value would be an imputed value) 
                # and that there are more than 6 hours between 2 measurements (otherwise small variation in measurment accuracy would have disproportionate influence on the value standardised over 24)
                if (last_lab_time1/np.timedelta64(1, 'm') <= 1560) & ((last_lab_time1 - last_lab_time2) >= datetime.timedelta(hours=6)):
                    # difference between current lab value (may have been fetched forward) and the last one recorded within a 50h period
                    # add a coefficient to scale the difference to a 24 hour time window
                    delta_time = df_prov1[df_prov1[var].notnull()].iloc[-1]['obstime'] - df_prov1[df_prov1[var].notnull()].iloc[-2]['obstime']
                    row[column] = round((df_prov1[df_prov1[var].notnull()].iloc[-1][var] - df_prov1[df_prov1[var].notnull()].iloc[-2][var])*(1440/(delta_time/np.timedelta64(1, 'm'))), 2)
                
           # all the following lines assume the change from baseline happened in 24h    
                elif np.isnan(median) == False:
                    # difference from admission median
                    row[column] = round((row[var] - median), 2)
                else:
                    # difference from the mean value of the normal range
                    baseline = lab_references[var][0]
                    row[column] = round((row[var] - baseline), 2)

            elif np.isnan(median) == False:
                # difference from admission median
                row[column] = round((row[var] - median), 2)
            else: 
                # difference from the mean value of the normal range
                baseline = lab_references[var][0]
                row[column] = round((row[var] - baseline), 2)
        
        return row
    
    df=df.apply(lambda x: delta_50(x, selected_var_labs, df_support_lab1, df_support_lab2, lab_references), axis = 1)
        
    return df

print('delta labs function ready')

Define get outcomes function (events, procedures and drug prescriptions)

In [ ]:
def get_outcomes(dfo, time_icu, time_arrest, time_death, time_reOP, first_day_procedure, max_procedure_day, max_prescription_time):
    
    mydb = mysql.connector.connect(
    host="mysql8",
    user= os.getenv('USERNAMEH'),
    passwd= os.getenv('PASSWORDH'),
    database="havenouh_v15"
    )

    # list of admissions with unplanned ICU admission with ICU admission times
    query10 = """
    SELECT DISTINCT ad.hadm_id, icu.admission_time
    FROM havenouh_v15.admissions AS ad
    LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
    LEFT JOIN havenouh_v15.icustays AS icu ON (icu.subject_id = ad.subject_id)
    LEFT JOIN havenouh_v15.chartevents AS ch ON (icu.subject_id = ch.subject_id)
    WHERE ad.hadm_id IN (""" + pop.string + """)
    AND (icu.admission_time BETWEEN ad.admission_time AND ad.discharge_time)
    AND TIMESTAMPDIFF(minute, tv.end_time, icu.admission_time) >0
    AND (ch.charttime BETWEEN tv.end_time AND icu.admission_time)
    AND (icu.admission_type IN ('Unplanned local admission', 'Unplanned transfer in') OR icu.admission_type IS NULL)
    ORDER BY ad.hadm_id, icu.admission_time;
    """
    df_icu_admissions= pd.read_sql(query10, mydb)

    # list of admissions with cardiac arrests after stay on the ward with arrest time
    query11 = """
    SELECT DISTINCT ad.hadm_id, ar.arrest_time
    FROM havenouh_v15.admissions AS ad
    LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
    LEFT JOIN havenouh_v15.arrestcalls AS ar ON (ar.subject_id = ad.subject_id)
    LEFT JOIN havenouh_v15.chartevents AS ch ON (ch.subject_id = ad.subject_id)
    WHERE ad.hadm_id IN (""" + pop.string + """)
    AND ar.arrest_time BETWEEN ad.admission_time AND ad.discharge_time
    AND TIMESTAMPDIFF(minute, tv.end_time, ar.arrest_time) >0
    AND (ch.charttime BETWEEN tv.end_time AND ar.arrest_time)
    ORDER BY ad.hadm_id, ar.arrest_time;
    """
    df_cardiac_arrest = pd.read_sql(query11, mydb)

    # list of in hospital mortality after stay on the ward with time of death
    query12 = """
    SELECT DISTINCT ad.hadm_id, ad.discharge_time
    FROM havenouh_v15.admissions AS ad
    LEFT JOIN havenouh_v15.theatre_visits AS tv ON (ad.hadm_id = tv.hadm_id)
    LEFT JOIN havenouh_v15.chartevents AS ch ON (ch.subject_id = ad.subject_id)
    WHERE ad.hadm_id IN (""" + pop.string + """)
    AND ch.charttime BETWEEN tv.end_time AND ad.discharge_time
    AND ad.discharge_method = 4
    ORDER BY ad.hadm_id, ad.discharge_time;
    """
    df_mortality = pd.read_sql(query12, mydb)

    # list of operations with time of operation
    query13 = """
    SELECT DISTINCT ad.hadm_id, tv.start_time
    FROM havenouh_v15.admissions AS ad
    LEFT JOIN havenouh_v15.theatre_visits AS tv ON (tv.hadm_id = ad.hadm_id)
    WHERE ad.hadm_id IN (""" + pop.string + """)
    ORDER BY ad.hadm_id;
    """
    df_OP = pd.read_sql(query13, mydb)

    # load OPCS codes dataframe
    infile = open('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/OPCS_codes','rb')
    df_opcs = pickle.load(infile)
    infile.close()
    
    # list of procedures (on the operation day or after) with time of procedures
    query14 = """
    SELECT fi.hadm_id, pr.procedure_date, pr.procedure_code
    FROM (
        SELECT min(end_time) AS OP_end_time, hadm_id
        FROM havenouh_v15.theatre_visits
        GROUP BY hadm_id) AS fi
    LEFT JOIN havenouh_v15.procedures AS pr ON (fi.hadm_id = pr.hadm_id)
    WHERE fi.hadm_id IN (""" + pop.string + """)
    AND day(pr.procedure_date) >= day(OP_end_time)
    ORDER BY fi.hadm_id;
    """
    df_procedures_ = pd.read_sql(query14, mydb)
    df_procedures = pd.merge(df_procedures_, df_opcs, how='left', on='procedure_code')

    # list of prescriptions with time of presciptions
    query15 = """
    SELECT prescription_id, hadm_id, order_mnemonic as drug_name, drug_form, drug_route, dose, frequency, subject_own_meds, order_placed_time as prescription_time
    FROM user_baptiste_vasey.list_prescriptions
    WHERE hadm_id IN (""" + pop.string + """)
    ORDER BY hadm_id, order_placed_time;
    """
    df_prescriptions = pd.read_sql(query15, mydb)

    mydb.close()   
    
    # calculate if an unplanned ICU admission occured within <time_icu> hours of a contact point
    def get_icu(df_icu):    
        hadm_icu = df_icu['hadm_id']
        session_time = df_icu['charttime']
        df_icu_temp = df_icu_admissions[df_icu_admissions['hadm_id'] == hadm_icu]
            
        if df_icu_temp.empty == True:
            score = 0    
        else:
            if df_icu_temp['admission_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_icu) else False).any() == True:
                score = 1
            else:  
                score = 0
        
        return score
    
    # calculate if cardiac arrest occured within <time_arrest> hours of a contact point
    def get_arrest(df_arrest):   
        hadm_arrest = df_arrest['hadm_id']
        session_time = df_arrest['charttime']
        df_arrest_temp = df_cardiac_arrest[df_cardiac_arrest['hadm_id'] == hadm_arrest]
            
        if df_arrest_temp.empty == True:     
            score = 0    
        else:    
            if df_arrest_temp['arrest_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_arrest) else False).any() == True:
                score = 1
            else:      
                score = 0
        
        return score
    
    # calculate if a patient died within <time_death> hours of a contact point
    def get_death(df_death):
            
        hadm_death = df_death['hadm_id']
        session_time = df_death['charttime']
        df_death_temp = df_mortality[df_mortality['hadm_id'] == hadm_death]
            
        if df_death_temp.empty == True:    
            score = 0
        else:   
            if df_death_temp['discharge_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_death) else False).any() == True:
                score = 1
            else:     
                score = 0
        
        return score

    # calculate if a readmission to theatre occured within <time_reOP> hours of a contact point
    def get_reOP(df_reOP):
            
        hadm_reOP = df_reOP['hadm_id']
        session_time = df_reOP['charttime']
        df_reOP_temp = df_OP[df_OP['hadm_id'] == hadm_reOP]
        
        if df_reOP_temp.empty == True:     
            score = 0
        else:
            if df_reOP_temp['start_time'].apply(lambda x: True if (0 < (x-session_time).total_seconds()/(60*60) <= time_reOP) else False).any() == True:
                score = 1   
            else:        
                score = 0
        
        return score
    
    # fetch all procedures booked on the <max_procedure_day> days following a contact point (the procedure metadata does not allow for a better time discrimination)
    def get_procedures(df_proc, df_procedures, first_day_procedure, max_procedure_day):
            
        hadm_proc = df_proc['hadm_id']
        session_day = df_proc['charttime'].date()
        min_day = datetime.datetime.combine((session_day + datetime.timedelta(days=first_day_procedure)), datetime.datetime.min.time())
        max_day = datetime.datetime.combine((session_day + datetime.timedelta(days=max_procedure_day)), datetime.datetime.min.time())
        df_proc_temp = df_procedures[(df_procedures['hadm_id'] == hadm_proc) & (df_procedures['procedure_date'] >= min_day) & (df_procedures['procedure_date'] <= max_day)].reset_index(drop=True)
            
        if df_proc_temp.empty == True:    
            procedures = ['no procedure']
        else:
            procedures = df_proc_temp['procedure_name'].to_list() 
            
        return procedures
    
    # fetch all drugs prescribed during the <max_prescription_time> hours following a contact point 
    def get_drugs(df_drugs):
        
        #subject_own_meds = ['POD Ward - OUH supplied', 'POD Home', 'PODP - brought in', 'POD Ward - brought in', 'PODP - OUH supplied', 'POD Fridge - brought in', 
        #                 'PODCD - OUH supplied', 'Relabel - OUH supplied', 'Relabel - brought in', 'Community Hospital Stock', 'PODCD - brought in', 'POD Fridge - OUH supplied']
        hadm_drugs = df_drugs['hadm_id']
        session_time = df_drugs['charttime']
        df_drugs_temp = df_prescriptions[(df_prescriptions['hadm_id'] == hadm_drugs) & df_prescriptions['subject_own_meds'].isnull()].drop_duplicates(subset = 'drug_name', keep = 'first')
  
        df_new_drugs = df_drugs_temp[(df_drugs_temp['prescription_time'] >= session_time) & (df_drugs_temp['prescription_time'] <= (session_time + datetime. timedelta(hours=max_prescription_time)))].reset_index(drop = True)
        
        if df_new_drugs.empty == True:    
            new_drugs = ['no new drugs']
        else:     
            new_drugs = []
            for i in df_new_drugs.index:
                new_drugs.append(df_new_drugs.loc[i,'drug_name'])
            
        return new_drugs
    
    # calculate the total number of observations sets for the same hospital admission
    def get_number(df_number):
        hadm_number = df_number['hadm_id']
        number = len(dfo[dfo['hadm_id'] == hadm_number].index)
        return number
   

    # create a new column where value = 1 if ICU admission in the next "time_icu" hours
    dfo['icu_event'] = dfo.apply(lambda row: get_icu(row), axis = 1)
    print ('icu done')
    
    # create a new column where value = 1 if cardiac arrest in the next "time_arrest" hours
    dfo['arrest_event'] = dfo.apply(lambda row: get_arrest(row), axis = 1)
    print ('arrest done')

    # create a new column where value = 1 if patient death in the next "time_death" hours
    dfo['mortality_event'] = dfo.apply(lambda row: get_death(row), axis = 1)
    print ('death done')
    
    # create a new column where value = 1 if re-operation in the next "time_reOP" hours
    dfo['reOP_event'] = dfo.apply(lambda row: get_reOP(row), axis = 1)
    print ('reOP done')
    
    # create a new column where value = list of procedures (OPCS codes) in the next "time_procedures" days
    dfo['procedures'] = dfo.apply(lambda row: get_procedures(row, df_procedures, first_day_procedure, max_procedure_day), axis = 1)
    print ('procedures done')
    
    # create a new column where value = list of procedures (OPCS codes) in the next "time_procedures" days
    dfo['new_prescriptions'] = dfo.apply(lambda row: get_drugs(row), axis = 1)
    print ('prescriptions done')
    
    # create a new column where value = total number of observations sets for the same hospital admission
    dfo['number_of_sessions'] = dfo.apply(lambda row: get_number(row), axis = 1)
    
    dfo.loc[(dfo.icu_event == 1) | (dfo.arrest_event == 1) | (dfo.mortality_event == 1) | (dfo.reOP_event == 1), 'event'] = 1
    dfo.loc[(dfo.icu_event != 1) & (dfo.arrest_event != 1) & (dfo.mortality_event != 1) & (dfo.reOP_event != 1), 'event'] = 0
    
    return dfo

print('get outcome function ready')

Define refine_prescriptions and refine_procedures

In [ ]:
def refine_prescriptions (df_pres, df_trans_drug):
    
    df_pres["new_fluid"] = np.NaN
    df_pres["new_prescriptions_grouped"] = ''
    df_pres.reset_index(inplace=True)

    # select and group prescriptions according to the document list_prescriptions_final.cvs
    for i in df_pres.index:
        pres_old = pd.DataFrame(df_pres['new_prescriptions'].iloc[i], columns=['new_prescriptions'])
        pres_new = pd.merge(pres_old, df_trans_drugs, left_on = 'new_prescriptions', right_on = 'prescription')

        # volume is considered as prescription if a patient receives more than 1000ml in 24h
        new_fluid = pres_new.quantity_ml.sum() 
        df_pres.loc[i, 'new_fluid'] = new_fluid
        if new_fluid >= 1000:
            new_row = {'new_prescriptions':'volume_tot', 'prescription':'volume_tot', 'category':'volume_tot', 'group':'volume_tot', 'included':1, 'fluid':'fluid', 'quantity_ml':new_fluid}
            pres_new = pres_new.append(new_row, ignore_index=True)

        prescriptions = pres_new[pres_new.included == 1]['group'].drop_duplicates().tolist()
        if len(prescriptions) == 0:    
             prescriptions = ['no_new_drugs']
        df_pres.at[i, 'new_prescriptions_grouped'] = prescriptions

    # Add a column with the number of new prescriptions (no filtering)
    df_pres['number_prescriptions'] = np.nan
    for i in df_pres.index:
        if df_pres['new_prescriptions'].iloc[i][0] == 'no new drugs':
            df_pres.loc[i, 'number_prescriptions'] = 0
        else:
            df_pres.loc[i, 'number_prescriptions'] = len(df_pres['new_prescriptions'].iloc[i])    

    # Add a column with the number of new prescriptions groups (selected groups of interest)   
    df_pres['number_prescriptions_groups'] = np.nan
    for i in df_pres.index:
        if df_pres['new_prescriptions_grouped'].iloc[i][0] == 'no_new_drugs':
            df_pres.loc[i, 'number_prescriptions_groups'] = 0
        else:
            df_pres.loc[i, 'number_prescriptions_groups'] = len(df_pres['new_prescriptions_grouped'].iloc[i])

    return df_pres       
            
     
def refine_procedures (df_proc, df_trans_proc):

    df_proc['new_procedures_grouped'] = ''
    df_proc.reset_index(inplace=True)

     # select and group procedures according to the document list_procedures_final.cvs
    for i in df_proc.index:
        proc_old = pd.DataFrame(df_proc['procedures'].iloc[i], columns=['procedures'])
        proc_new = pd.merge(proc_old, df_trans_proc, left_on = 'procedures', right_on = 'procedures')
        procedures = proc_new[proc_new.included == 1]['group'].drop_duplicates().tolist()
        if len(procedures) == 0:    
             procedures = ['no procedure']
        df_proc.at[i, 'new_procedures_grouped'] = procedures
        
    # Add a column with the number of new procedure groups (selected groups of interest)
    df_proc['number_procedures_groups'] = np.nan
    for i in df_proc.index:
        if df_proc['new_procedures_grouped'].iloc[i][0] == 'no procedure':
            df_proc.loc[i, 'number_procedures_groups'] = 0
        else:
            df_proc.loc[i, 'number_procedures_groups'] = len(df_proc['new_procedures_grouped'].iloc[i])

    return df_proc          
            
print('refine outcome functions ready')        

Process data extraction - part 1 - raw dataframe

In [ ]:
#select populations of interest
pop = pop4

#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26 

#---------------------------------------------------------------------------------------------------------------

start = datetime.datetime.now()

#extract the data
df_raw = extraction(pop)
df_raw = df_raw.drop(columns='URUR') # remove one of the variable found later in the analysis to have been mistakenly included
print('extraction completed')
print('number of data colection timepoints (rows): ' + str(len(df_raw.index)))
print('number of hospital admissions: ' + str(len(df_raw['hadm_id'].unique())))

#----------------------------------------------------------------------------------------------------------------

# generate the necessary support databases for the get_status function

# connect to database
mydb = mysql.connector.connect(
    host="mysql8",
    user= os.getenv('USERNAMEH'),
    passwd= os.getenv('PASSWORDH'),
    database="havenouh_v15"
)

# get list of all observation taken while in ICU
query3 = """
SELECT vlp.*, 
    co.subject_id, co.gender, co.age, co.hospital_code, co.treatment_function_code, co.admission_time, co.discharge_time, co.cci, co.weight, co.BMI,
    icu.admission_time as ICU_admission_time, icu.discharge_time as ICU_discharge_time
FROM user_baptiste_vasey.vitals_labs_POCTs vlp
LEFT JOIN user_baptiste_vasey.cohort co
ON vlp.hadm_id = co.hadm_id
LEFT JOIN (
    SELECT subject_id, admission_time, discharge_time
    FROM havenouh_v15.icustays
    ORDER BY subject_id
    ) icu
ON co.subject_id = icu.subject_id
WHERE vlp.hadm_id IN (""" + pop.string + """) 
AND vlp.obstime >= icu.admission_time
AND vlp.obstime <= icu.discharge_time
ORDER BY vlp.hadm_id, vlp.obstime;"""
df_list_ICUobs = pd.read_sql(query3, mydb)

# get list of all observation taken while in theatre or <1 hour postop
query4 = """
SELECT vlp.*, 
        co.subject_id,
        th.start_time as OP_start_time, th.end_time as OP_end_time
FROM user_baptiste_vasey.vitals_labs_POCTs vlp
LEFT JOIN user_baptiste_vasey.cohort co
ON vlp.hadm_id = co.hadm_id
LEFT JOIN (
    SELECT hadm_id, start_time, end_time
    FROM havenouh_v15.theatre_visits
    ) th
ON vlp.hadm_id = th.hadm_id
WHERE vlp.hadm_id IN (""" + pop.string + """) 
AND vlp.obstime >= th.start_time
AND vlp.obstime <= DATE_ADD(th.end_time, INTERVAL 1 HOUR)
ORDER BY vlp.hadm_id, vlp.obstime;"""
df_list_OPobs = pd.read_sql(query4, mydb)
mydb.close()

# attribute provenance status
df_raw['status'] = df_raw.apply(lambda x: get_status(x, df_list_ICUobs, df_list_OPobs), axis = 1)
print('status completed')

#-------------------------------------------------------------------------------------------------

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

Process data extraction - part 2 - starting point ward dataframe

In [ ]:
#select populations of interest
pop = pop4

#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26 

start = datetime.datetime.now()
print(pop.name)

#---------------------------------------------------------------------------------------------------------------

# create a column counting the number of existing observation in a set (discard sets with only one type of obs, such as temperature)
df_raw['n_obs_set'] = np.nan
df_raw['n_obs_set'] = df_raw[['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'FIO2_LMIN']].apply(lambda x: x.count(), axis=1)

# create a dataframe with all observation set taken on the ward, with lab and POCT values fetch in the predefined window => used to quantify missing data
df_start=df_raw[(df_raw['n_obs_set'] >= 6) & (df_raw['status'].isin([2, 6]))].copy(deep=True)
df_start.drop_duplicates(subset=['obstime'], inplace=True)
df_start = fetch_labs(df_start, df_raw, selected_fetch_window)
print('initial database complete')
print('number of contact points (rows): ' + str(len(df_start.index)))
print('number of hospital admissions: ' + str(len(df_start['hadm_id'].unique())))
                                                   
#----------------------------------------------------------------------------                                                   
                                                   
end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))                                                                                              

Process data extraction - part 3 - imputed dataframe

In [ ]:
#select populations of interest
pop = pop4

#select fetch windows of interest (for labs and POCT)
selected_fetch_window = 26

#---------------------------------------------------------------------------------------------------------------

start = datetime.datetime.now()
print(pop.name)

# impute missing vital signs and get NEWS score
df_imputed = vitals_complete(df_start)                                       
df_imputed = get_NEWS(df_imputed)
print('vitals imputation and NEWS calculation complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

# add columns recording missingness 

df_imputed = track_missingness(df_imputed)
print('track missingness complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

# impute missing labs and POCT values
df_imputed = impute_labs(df_imputed, df_raw)
print('labs imputation complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

# create a minmax column for all vitals (min-max amplitude in the last 26h)
df_imputed = min_max(df_imputed, df_raw)
print('minmax complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

# create a delta column for all vitals (delta since last measurmement)
df_imputed = delta_vitals(df_imputed,df_raw)
print('delta vitals complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

# create a delta column for all labs + POCT (delta since last measurmement)
df_imputed = delta_labs(df_imputed,df_raw)
print('delta labs complete')

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

#----------------------------------------------------------------------------

print('imputed database complete')
print('number of contact points (rows): ' + str(len(df_imputed.index)))
print('number of hospital admissions: ' + str(len(df_imputed['hadm_id'].unique())))

Process data extraction - part 4 - get outcomes

In [ ]:
#select populations of interest
pop = pop4

# define time window of interest in the future (times are in hours, except time_procedure which is in days, 0 being same day procedures)
icu = 24 # time in hours before ICU admission
arrest = 24 # time in hours before cardiac arrest
death = 24 # time in hours before death
reOP = 24 # time in hours before re-operation
first_day_procedure = 1 # 0 if want to consider same day procedure, 1 otherwise
max_procedure_day = 1 # time in days into the future to consider procedures, 0 being same day procedure, must be >= first_day_procedure
max_prescription_time = 24 # time in hours into the future to consider new prescription

start = datetime.datetime.now()
print(pop.name)

df_complete = get_outcomes(df_imputed, time_icu = icu, time_arrest = arrest, time_death = death, time_reOP = reOP, first_day_procedure = first_day_procedure,
                           max_procedure_day = max_procedure_day, max_prescription_time = max_prescription_time)

print('database with outcome complete')
print('number of contact points (rows): ' + str(len(df_imputed.index)))
print('number of hospital admissions: ' + str(len(df_imputed['hadm_id'].unique())))

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

Process data extraction - part 5 - refine outcomes

In [ ]:
# load the prescription database with drug categories grouping
df_trans_drugs = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/list_prescriptions_final.csv')

# load the procedure database with procedures grouping
df_trans_proc = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/list_procedures_final.csv')

start = datetime.datetime.now()
print(pop.name)

# pre-process and refine the prescriptions 
df_clustering_ready = refine_prescriptions(df_complete, df_trans_drugs)
print('prescriptions done')

# pre-process and refine the procedures 
df_clustering_ready = refine_procedures(df_clustering_ready, df_trans_proc)
print('procedures done')

print('database with outcome complete')
print('number of contact points (rows): ' + str(len(df_clustering_ready.index)))
print('number of hospital admissions: ' + str(len(df_clustering_ready['hadm_id'].unique())))

end = datetime.datetime.now()
run_time = (end - start)
print('run time: ' + str(run_time))

Sampling and hold out test

In [ ]:
df_analysis = df_clustering_ready.copy()

#select sampling of interest
sampling_meth = 'first'  # define sampling method:
                             # all = all postoperative observations on the ward
                             # allalarm = all observation triggering an alarm postoperatively on the ward,
                             # first = first observation triggering an alarm postoperatively on the ward, 
                             # oneofeach = first observation triggering an alarm postoperatively on the ward + one random observation set for all the other admissions
# in addition for the samplin method "firts" and "oneofeach", only observation set within 30 ays of the last operation are considered.
                
if sampling_meth == 'all':
    df_analysis = df_analysis[(df_analysis['status'] == 2)].copy(deep=True)

elif sampling_meth == 'allalarm':
    df_analysis = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1)].copy(deep=True)

elif sampling_meth == 'first':    
    df_analysis = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')

elif sampling_meth == 'oneofeach':
    # select the first observation set triggerng an alarm for each hospital admission triggering at least one alarm
    df_analysis_1 = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
    hadm_alarm = list(df_analysis_1['hadm_id'].unique())  
    # select one random observation set in each of the remaining admissions
    df_analysis_2 = df_analysis[(~df_analysis['hadm_id'].isin(hadm_alarm)) & (df_analysis['status'] == 2) & (df_analysis['alarm'] == 0) & (df_analysis['post_lastOP_time_hours'] < 720)].sample(frac=1, random_state = 2206).drop_duplicates(subset=['hadm_id'], keep='first')
    df_analysis = pd.concat([df_analysis_1, df_analysis_2])

else:
    print("error sampling method")
    
    
# hold out test set (df_test)
X_gss = df_analysis
y_gss = df_analysis.hospital_code # only to allow function to run
group = df_analysis.subject_id # prevent data leakage from same patient with several hospital admission
gss = GroupShuffleSplit(n_splits=2, train_size=.8, random_state=2206)
gss.get_n_splits()
train_ix, test_ix = next(gss.split(X_gss, y_gss, groups=group))

df_train = X_gss.iloc[train_ix].copy()
df_test = X_gss.iloc[test_ix].copy()
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

print('length index (contact points) after sampling: ' + str(len(df_analysis.index)))
print('from ' + str(len(df_analysis['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_analysis['subject_id'].unique())) + 'patients')

print('length index train set: ' + str(len(df_train.index)))
print('from ' + str(len(df_train['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_train['subject_id'].unique())) + 'patients')

print('length index test set: ' + str(len(df_test.index)))
print('from ' + str(len(df_test['hadm_id'].unique())) + ' hospital admissions' )
print('representing ' + str(len(df_test['subject_id'].unique())) + 'patients')
print('sampling completed')

Clustering and analysis

Dataset description

In [ ]:
def characteristics(df, marker):
    print(marker)
    n_patients=len(df['subject_id'].unique())
    n_admissions=len(df['hadm_id'].unique())
    n_observations=len(df)
    n_alarms=len(df[df['alarm']==1])
    n_events=len(df[df['event']==1])
    PPV=len(df[(df['alarm']==1) & (df['event']==1)])/len(df[df['alarm']==1])
    sensitivity=len(df[(df['alarm']==1) & (df['event']==1)])/len(df[df['event']==1])
    n_hadm_events=len(df[df['event']==1]['hadm_id'].unique())
    n_hadm_arrest=len(df[df['arrest_event']==1]['hadm_id'].unique())
    n_hadm_ICU=len(df[df['icu_event']==1]['hadm_id'].unique())
    n_hadm_death=len(df[df['mortality_event']==1]['hadm_id'].unique())
    n_hadm_reOP=len(df[df['reOP_event']==1]['hadm_id'].unique())
    age=df.drop_duplicates('hadm_id').age.median()
    age25, age75=np.percentile(df.drop_duplicates('hadm_id').age, [25 ,75])
    cci=df.drop_duplicates('hadm_id').cci.median()
    cci25, cci75=np.percentile(df.drop_duplicates('hadm_id').cci, [25 ,75])
    print('n_patients = ' + str(n_patients))
    print('n_admissions = ' + str(n_admissions))
    print('n_observations = ' + str(n_observations))
    print('n_alarms = ' + str(n_alarms))
    print('n_events = ' + str(n_events))
    print('PPV = ' + str(PPV))
    print('sensitivity = ' + str(sensitivity))
    print('n_hadm_events = ' + str(n_hadm_events))
    print('n_hadm_arrest = ' + str(n_hadm_arrest))
    print('n_hadm_ICU = ' + str(n_hadm_ICU))
    print('n_hadm_death = ' + str(n_hadm_death))
    print('n_hadm_reOP = ' + str(n_hadm_reOP))
    print('age = ' + str(age))
    print('age25 = ' + str(age25))
    print('age75 = ' + str(age75))
    print('cci = ' + str(cci))
    print('cci25 = ' + str(cci25))
    print('cci75 = ' + str(cci75))
    print('---------------------------------')
    
hadm_event = list(df_analysis[df_analysis['event'] == 1]['hadm_id'].unique())
df_analysis_1 = df_analysis[df_analysis['hadm_id'].isin(hadm_event)]
df_analysis_2 = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
df_analysis_3 = df_analysis_2[(df_analysis_2['event'] == 1)]

df_analysis_ideal = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['event'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].drop_duplicates(subset=['hadm_id'], keep='first')
df_analysis_test = df_analysis[(df_analysis['status'] == 2) & (df_analysis['alarm'] == 1) & (df_analysis['post_lastOP_time_hours'] < 720)].sample(frac=1).drop_duplicates(subset=['hadm_id'], keep='first')

characteristics(df_analysis, 'All')
characteristics(df_analysis_1, 'At least 1 event')
characteristics(df_analysis_2, 'first NEWS')
characteristics(df_analysis_3, 'first NEWS with event')
characteristics(df_analysis_test, 'random selection')
characteristics(df_analysis_ideal, 'ideal')

Prescriptions et procedures description

In [ ]:
# prescriptions
ax=df_analysis['number_prescriptions_groups'].hist(bins=15)
ax.set_title('Distribution of the number of new prescription within 24 hours \n for '+pop.description)
ax.set_xlabel('number of new prescriptions')
ax.set_ylabel('number of observation sets')
plt.show()

unique_set = set()
for i in df_analysis.index:
    index_pres = df_analysis.new_prescriptions_grouped[i]
    new_pres = set(index_pres) - unique_set
    unique_set.update(new_pres)

# number and name of unique prescription groups in the sampled observations
print('groups present in the sampling of interest: ' + str(len(unique_set)))
print(unique_set)

# number and details of prescription regimes in the sampled observations
li = list(df_analysis.new_prescriptions_grouped)
count = Counter(str(e) for e in li)
print('\nOccurence of prescription groups combinations: ' + str(len(count)))
print(count)
print(df_analysis['number_prescriptions_groups'].describe())
In [ ]:
# procedures
ax=df_analysis['number_procedures_groups'].hist(bins=15)
ax.set_title('Distribution of the number of new procedure within 24 hours \n for the sampled contact points')
ax.set_xlabel('number of new procedures')
ax.set_ylabel('number of observation sets')
plt.show()

unique_set = set()
for i in df_analysis.index:
    index_proc = df_analysis.new_procedures_grouped[i]
    new_proc = set(index_proc) - unique_set
    unique_set.update(new_proc)

# number and name of unique procedure groups in the sampled observations
print('groups present in the sampling of interest: ' + str(len(unique_set)))
print(unique_set)

# number and details of procedure regimes in the sampled observations
li = list(df_analysis.new_procedures_grouped)
count = Counter(str(e) for e in li)
print('\nOccurence of procedure groups combinations: ' + str(len(count)))
print(count)
print(df_analysis['number_procedures_groups'].describe())

Define input features

All analyses below should be repeated for each input features set

In [ ]:
# select method of input selection: 
#"all" = all features
#"missingness" = all vital signs + all admin data + all labs with less than the specified missingness rate
#"vitals" = only vital signs (same as in NEWS)
#"hand_picked" = selected list of variables based on the author clinical experience

method = 'missingness'

# max % of missing values (only if method = missingness)
max_missing = 66.66 

# define input features
input_features = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 
           'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 
           'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC','age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']

if method == 'all':
    variables = input_features

elif method == 'missingness':
    # calculate the rate of missing data for each feature (considered missing if no value in the 26h preceeding the contact point), only apply to lab variables
    missing = df_analysis[['ALB_miss', 'ALP_miss', 'ALT_miss', 'APTT_miss', 'AST_miss', 'BAS_miss', 'BASEX_miss', 'BICAR_miss', 'CAL_miss', 'CL_miss', 
                         'CR_miss', 'CRP_miss', 'COCA_miss', 'EGFR_miss', 'EOS_miss', 'GLU_miss', 'HCT_miss', 'HGB_miss', 'INR_miss', 'KET_miss', 
                         'LAC_miss', 'LYM_miss', 'MCH_miss', 'MCHC_miss', 'MCV_miss', 'METHB_miss', 'MON_miss', 'MPV_miss', 'NEU_miss', 'OSM_miss', 'PH_miss', 
                         'PLT_miss', 'POT_miss', 'RBC_miss', 'SOD_miss', 'TBIL_miss', 'TROP_miss', 'UR_miss', 'WBC_miss']].mean()*100

    variables_ = missing[missing>(100-max_missing)].index.to_list()
    variables=[]
    for var_ in variables_:
        var = var_.replace('_miss', '')
        variables.append(var)
    variables = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2'] + variables + ['age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']

elif method == 'hand_picked':
    variables = ['HR','RR', 'SBP', 'SPO2', 'TEMP', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST','BICAR', 'CR', 'CRP', 'EGFR', 'GLU', 'HCT', 'HGB', 'INR', 
                 'LAC', 'PH', 'PLT', 'POT', 'SOD', 'TBIL', 'WBC','age', 'cci','post_firstOP_time_hours', 'post_lastOP_time_hours']
    
elif method == 'vitals':
    
    variables = ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2']
    
else:
    print('input features selection error')

# add delta to lab values in variables
for var in variables:
    if var in ['ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP','COCA', 'EGFR', 'EOS', 'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 
               'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC']:
        var0 = var + '_delta'
        variables.append(var0)

# add delta and minmax to vitals in variables
for vitals in variables:
    if vitals in ['HR','RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2']:
        vitals_minmax = vitals + '_minmax'
        variables.append(vitals_minmax)
        vitals_delta = vitals + '_delta'
        variables.append(vitals_delta)

print(variables)
print('initial dimensionality: ' + str(len(variables)))

Autoencoder parameters tuning

In [ ]:
# describe autoencoder parameters to optimise
N_layers = [2, 4, 8, 16]
N_neurons = [64, 128, 256, 512]
Latent_dimension = [4, 8, 16, 32]

results = np.zeros((len(N_layers), len(N_neurons), len(Latent_dimension)))
np.random.seed(1301)
random.set_seed(2206)

start=datetime.datetime.now()

for n_layers in N_layers:
    print('+++++++++++++++')
    pos_layer = N_layers.index(n_layers)
    for n_neurons in N_neurons:
        print('------------')
        pos_neuron = N_neurons.index(n_neurons)
        for latent_dimension in Latent_dimension:
            pos_latent = Latent_dimension.index(latent_dimension)
            
            print([n_layers, n_neurons, latent_dimension])
            
            vector_res = []
            
            # kfold crossvalidation
            X_gss = df_train
            y_gss = df_train.hospital_code # only to allow function to run
            group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
            group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
            group_kfold.get_n_splits()
            for train_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
                
                df_opti = X_gss.iloc[train_index]
                df_val = X_gss.iloc[val_index]
            
                X = df_opti[variables]
                X_val = df_val[variables]
                dimension = len(variables)

                # standardisation
                X_unscaled = X.to_numpy()
                X_scaled = StandardScaler().fit_transform(X_unscaled)
                X_val_unscaled = X_val.to_numpy()
                X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)
            
                # define encoder architecture
                input_data = keras.Input(shape=(dimension,))
                encoded = layers.Dense(n_neurons, activation='relu')(input_data)
                for encoder_layer in range(1, n_layers): # enable optimisation on the number of layers
                    encoded = layers.Dense(n_neurons, activation='relu')(encoded)
                    
                # latent space    
                encoded = layers.Dense(latent_dimension, activation='relu')(encoded)

                # define decoder architecture
                decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
                for decoder_layer in range(1, n_layers): # idem
                    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
                decoded = layers.Dense(dimension)(decoded)

                autoencoder = keras.Model(input_data, decoded)
                autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
                
                # fit autoencoder
                AE = autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=32, shuffle=True,
                                validation_data=(X_val_scaled, X_val_scaled), verbose = 0)

                # update results matrix
                res_ = AE.history['val_loss'][-1]
                vector_res.append(res_)
            
            res = np.mean(vector_res)
            print(res)
            results[pos_layer, pos_neuron, pos_latent] = res

best_param = np.argwhere(results == np.amin(results))
print(results)
print('Best parameters and loss')
print(best_param[0])
print(results[(best_param[0][0],best_param[0][1],best_param[0][2])])

end=datetime.datetime.now()
print('runtime: ' + str(end-start))

# save optimisation table if needed
results_ = results
names = ['x', 'y', 'z']
index = pd.MultiIndex.from_product([range(s)for s in results_.shape], names=names)
df = pd.DataFrame({'A': results_.flatten()}, index=index)['A']
df = df.unstack(level='z').sort_index()
df.columns = ['Latent_4', 'Latent_8', 'Latent_16', 'Latent_32']
df.index.names = ['Layer', 'Neuron']
print(df)

# save the parameter optimisation table if needed
df.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_optimisation_first_vitals.csv')

Cluster number optimisation (k-means): elbow method and silhouette score - supplementary materials only

In [ ]:
start=datetime.datetime.now()

#dim_reduction = ['none', 'PCA', 'AE']
#optimisation_method = ['elbow', 'silhouette']

# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]

np.random.seed(1301)
random.set_seed(2206)

def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
    # select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
    n_neurons = 128
    n_layers = 2    
    score_method = []
    
    df_results = pd.DataFrame(index = n_clusters)
    X = df_train[variables]

    # standardisation
    X_unscaled = X.to_numpy()
    X_scaled = StandardScaler().fit_transform(X_unscaled)

    # dimensionality reduction
    if dim == 'PCA':
        pca = PCA(n_components=n_components, svd_solver='full')
        X_latent = pca.fit_transform(X_scaled)

    elif dim == 'AE':
        dimension = len(variables)
        batch = 32

        # define encoder architecture
        input_data = keras.Input(shape=(dimension,))
        encoded = layers.Dense(n_neurons, activation='relu')(input_data)
        for encoder_layer in range(1, n_layers): 
            encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
        
        # latent space
        encoded = layers.Dense(n_latdim, activation='relu')(encoded)

        # define decoder architecture
        decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
        for decoder_layer in range(1, n_layers): 
            decoded = layers.Dense(n_neurons, activation='relu')(decoded)
        decoded = layers.Dense(dimension)(decoded)

        autoencoder = keras.Model(input_data, decoded)
        encoder = keras.Model(input_data, encoded)
        autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
        
        # fit autoencoder
        autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
        X_latent = encoder.predict(X_scaled)

    elif dim == 'none':
        X_latent = X_scaled

    else:
        print('dimensionality reduction error')

    # cluster number optimisation method
    results_cluster = []
    for n in n_clusters:

        if method == 'elbow':
            seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
            kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10, 
            random_state=seed_kmeans, tol=0.0001, verbose=0)
            results_opti = kmeansModel.fit_predict(X_latent)
            score_ = kmeansModel.inertia_

        elif method == 'silhouette':
            seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
            kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10, 
            random_state=seed_kmeans, tol=0.0001, verbose=0)
            results_opti = kmeansModel.fit_predict(X_latent)
            score_ = metrics.silhouette_score(X_latent, results_opti, metric='euclidean')

        else:
            print('evaluation method error')

        results_cluster.append(score_)
 
    return results_cluster      
    
#--------------------------------------------------------------------------------------------------------

# get the inertia values (elbow method) and silhouette scores for each combination of k-means and dimensionality reduction method

df_opti_none_elbow = pd.DataFrame(index = number_clusters, columns = ['none_elbow'])
new_col = f_cluster_opti(variables, dim='none', method = 'elbow', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_elbow['none_elbow'] = new_col
print('df_opti_none_elbow done')

df_opti_none_silhouette = pd.DataFrame(index = number_clusters, columns = ['none_silhouette'])
new_col = f_cluster_opti(variables, dim='none', method = 'silhouette', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_silhouette['none_silhouette'] = new_col
print('df_opti_none_silhouette done')

df_opti_PCA_elbow = pd.DataFrame(index = number_clusters, columns = ['PCA_elbow_0', 'PCA_elbow_1', 'PCA_elbow_2', 'PCA_elbow_3', 'PCA_elbow_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'elbow', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_elbow['PCA_elbow_'+str(j)] = new_col
print('df_opti_PCA_elbow done')
    
df_opti_PCA_silhouette = pd.DataFrame(index = number_clusters, columns = ['PCA_silhouette_0', 'PCA_silhouette_1', 'PCA_silhouette_2', 'PCA_silhouette_3', 'PCA_silhouette_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'silhouette', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_silhouette['PCA_silhouette_'+str(j)] = new_col
print('df_opti_PCA_silhouette done')
    
df_opti_AE_elbow = pd.DataFrame(index = number_clusters, columns = ['AE_elbow_0', 'AE_elbow_1', 'AE_elbow_2', 'AE_elbow_3', 'AE_elbow_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'elbow', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_elbow['AE_elbow_'+str(j)] = new_col
print('df_opti_AE_elbow done')
    
df_opti_AE_silhouette = pd.DataFrame(index = number_clusters, columns = ['AE_silhouette_0', 'AE_silhouette_1', 'AE_silhouette_2', 'AE_silhouette_3', 'AE_silhouette_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'silhouette', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_silhouette['AE_silhouette_'+str(j)] = new_col
print('df_opti_AE_silhouette done')
    
# save results for reuse in graph production if needed
df_opti_none_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_none_elbow_first_missingness.csv')
df_opti_none_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_none_silhouette_first_missingness.csv')
df_opti_PCA_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_PCA_elbow_first_missingness.csv')
df_opti_PCA_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_PCA_silhouette_first_missingness.csv')
df_opti_AE_elbow.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_AE_elbow_first_missingness.csv')
df_opti_AE_silhouette.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_AE_silhouette_first_missingness.csv')

# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()

axs[0].plot(df_opti_none_elbow['none_elbow'])
axs[0].set_title('k-means without dimensionality reduction,\n elbow method', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Inertia', fontsize = 12, labelpad = 10)

axs[1].plot(df_opti_none_silhouette['none_silhouette'])
axs[1].set_title('k-means without dimensionality reduction,\n silhouette score', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)

axs[2].plot(df_opti_PCA_elbow[['PCA_elbow_0', 'PCA_elbow_1', 'PCA_elbow_2', 'PCA_elbow_3', 'PCA_elbow_4']])
axs[2].set_title('PCA + k-means,\n elbow method', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Inertia', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper right')

axs[3].plot(df_opti_PCA_silhouette[['PCA_silhouette_0', 'PCA_silhouette_1', 'PCA_silhouette_2', 'PCA_silhouette_3', 'PCA_silhouette_4']])
axs[3].set_title('PCA + k-means,\n silhouette score', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper right')

axs[4].plot(df_opti_AE_elbow[['AE_elbow_0', 'AE_elbow_1', 'AE_elbow_2', 'AE_elbow_3', 'AE_elbow_4']])
axs[4].set_title('Autoencoder + k-means,\n elbow method', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Inertia', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper right')

axs[5].plot(df_opti_AE_silhouette[['AE_silhouette_0', 'AE_silhouette_1', 'AE_silhouette_2', 'AE_silhouette_3', 'AE_silhouette_4']])
axs[5].set_title('Autoencoder + k-means,\n silhouette score', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Silhouette score', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper right')

plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigElbowSilhouette_first_missingness_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Cluster number optimisation (k-means and hierarchical clustering): score -log(1-P(i)) - supplementary materials only

Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice

In [ ]:
start=datetime.datetime.now()

# custom score for cluster number optimisation using the distribution of distances from cluster centroids (abnormality detection). Should be minimised.
def clustering_score(df_opti, df_val, variables, centroids, X_opti_latent, X_val_latent):
    
    def probability_function(cluster, centroids, target, df_opti, X_opti_latent):
        df_cluster = df_opti[df_opti['cluster']==cluster]
        d = np.square(X_opti_latent[df_cluster.index] - centroids[cluster]).sum(axis=1) # Euclidean distance
        d_target = np.square(target - centroids[cluster]).sum()
        density_below = len(d[d<d_target])/len(d)
        score_density = -np.log(1-density_below+0.01) # 0.01 added to avoid log(0)=inf
        return score_density
    
    score = []
    for i in df_val.index:  
        cluster = df_val.loc[i, 'cluster']
        score_ = probability_function(cluster, centroids, X_val_latent[i], df_opti, X_opti_latent)
        score.append(score_)

    final_score = (np.mean(score))
    
    return final_score

#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']

# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]

np.random.seed(1301)
random.set_seed(2206)

def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
    # select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
    n_neurons = 64
    n_layers = 2    
    score_method = []

    # kfold crossvalidation
    X_gss = df_train
    y_gss = df_train.number_prescriptions_groups # only to allow function to run
    group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
    group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
    group_kfold.get_n_splits()
    
    df_results = pd.DataFrame(index = n_clusters)
    k = 1

    for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
        df_opti = X_gss.iloc[opti_index].copy()
        df_val = X_gss.iloc[val_index].copy()
        df_opti.reset_index(inplace=True, drop=True)
        df_val.reset_index(inplace=True, drop=True)
        X = df_opti[variables]
        X_val = df_val[variables]

        # standardisation
        X_unscaled = X.to_numpy()
        X_scaled = StandardScaler().fit_transform(X_unscaled)
        X_val_unscaled = X_val.to_numpy()
        X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)

        # dimensionality reduction
        if dim == 'PCA':
            pca = PCA(n_components=n_components, svd_solver='full')
            X_latent = pca.fit_transform(X_scaled)
            X_val_latent = pca.transform(X_val_scaled)

        elif dim == 'AE':
            dimension = len(variables)
            batch = 32

            # define encoder architecture
            input_data = keras.Input(shape=(dimension,))
            encoded = layers.Dense(n_neurons, activation='relu')(input_data)
            for encoder_layer in range(1, n_layers): 
                encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
            
            # latent space
            encoded = layers.Dense(n_latdim, activation='relu')(encoded)

            # define decoder architecture
            decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
            for decoder_layer in range(1, n_layers): 
                decoded = layers.Dense(n_neurons, activation='relu')(decoded)
            decoded = layers.Dense(dimension)(decoded)

            autoencoder = keras.Model(input_data, decoded)
            encoder = keras.Model(input_data, encoded)
            autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
            
            # fit autoencoder
            autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
            X_latent = encoder.predict(X_scaled)
            X_val_latent = encoder.predict(X_val_scaled)

        elif dim == 'none':
            X_latent = X_scaled
            X_val_latent = X_val_scaled

        else:
            print('dimensionality reduction error')

        # clustering
        results_cluster = []
        for n in n_clusters:

            if method == 'kmeans':
                seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
                kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10, 
                random_state=seed_kmeans, tol=0.0001, verbose=0)
                results_opti = kmeansModel.fit_predict(X_latent)
                results_val = kmeansModel.predict(X_val_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                df_val.loc[:,'cluster'] = results_val.tolist()
                centroids = kmeansModel.cluster_centers_
                score_clust = clustering_score(df_opti, df_val, variables, centroids, X_latent, X_val_latent)

            elif method == 'hierarchical':
                h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='ward')
                results_opti = h_clust.fit_predict(X_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                for i in df_val.index:
                    dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
                    df_opti['dist']=dist
                    target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
                    df_val.loc[i,'cluster'] = target_cluster
                df_val['cluster'] = df_val['cluster'].apply(np.int64)
                
                # calculate cluster centroids
                points = X_latent
                labels = results_opti
                clf = NearestCentroid()
                clf.fit(points, labels)
                centroids = clf.centroids_
                
                # calculate performance
                score_clust = clustering_score(df_opti, df_val, variables, centroids, X_latent, X_val_latent)
                
            else:
                print('clustering model error')

            results_cluster.append(score_clust)
        
        df_results['split_'+str(k)] = results_cluster
        print(k)
        k = k+1
        
    results = df_results.mean(axis=1)
    
    return results        
    
#--------------------------------------------------------------------------------------------------------

# get the mean log score values for each combination of dimensionality reduction and clustering methods

df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')

df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')

df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
    
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
    
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')

df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
    
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreLog_AE_hierarchical_first_vitals.csv')

# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()

axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Custom score value', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigScoreLog_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Cluster number optimisation (k-means and hierarchical clustering): recall at position

Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice

In [ ]:
start=datetime.datetime.now()

# custom score for cluster number optimisation using recall at position. Should be maximised.
def pres_recommendation_perf(df_train, df_test, n_clusters):
    cluster_prescriptions_top_10 = {}
    
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
        pres_list =[]
        for x in target:
            pres_list.extend(x)    
        df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
        df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
        df_cluster_pres.columns = ['prevalence']
        df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
        cluster_top_10 = df_cluster_pres[df_cluster_pres['rank'] <= 10].index.to_list()
        cluster_prescriptions_top_10["cluster_" + str(c)] = cluster_top_10
    
    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
 
    overall_perf_ = 0
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_prescriptions_top_10["cluster_" + str(cluster)]   
        prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
        recommended_pres = intersection(prescriptions, top_10_cluster)
        performance = len(recommended_pres)/len(prescriptions)      
        overall_perf_ += performance
    
    overall_perf = overall_perf_/len(df_test.index)
    
    return overall_perf

#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']

# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]

np.random.seed(1301)
random.set_seed(2206)

def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
    # select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
    n_neurons = 64
    n_layers = 2    
    score_method = []

    # kfold crossvalidation
    X_gss = df_train
    y_gss = df_train.number_prescriptions_groups # only to allow function to run
    group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
    group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
    group_kfold.get_n_splits()
    
    df_results = pd.DataFrame(index = n_clusters)
    k = 1

    for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
        df_opti = X_gss.iloc[opti_index].copy()
        df_val = X_gss.iloc[val_index].copy()
        df_opti.reset_index(inplace=True, drop=True)
        df_val.reset_index(inplace=True, drop=True)
        X = df_opti[variables]
        X_val = df_val[variables]

        # standardisation
        X_unscaled = X.to_numpy()
        X_scaled = StandardScaler().fit_transform(X_unscaled)
        X_val_unscaled = X_val.to_numpy()
        X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)

        # dimensionality reduction
        if dim == 'PCA':
            pca = PCA(n_components=n_components, svd_solver='full')
            X_latent = pca.fit_transform(X_scaled)
            X_val_latent = pca.transform(X_val_scaled)

        elif dim == 'AE':
            dimension = len(variables)
            batch = 32

            # define encoder architecture
            input_data = keras.Input(shape=(dimension,))
            encoded = layers.Dense(n_neurons, activation='relu')(input_data)
            for encoder_layer in range(1, n_layers): 
                encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
            
            # latent space
            encoded = layers.Dense(n_latdim, activation='relu')(encoded)

            # define decoder architecture
            decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
            for decoder_layer in range(1, n_layers): 
                decoded = layers.Dense(n_neurons, activation='relu')(decoded)
            decoded = layers.Dense(dimension)(decoded)

            autoencoder = keras.Model(input_data, decoded)
            encoder = keras.Model(input_data, encoded)
            autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
            
            # fit autoencoder
            autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
            X_latent = encoder.predict(X_scaled)
            X_val_latent = encoder.predict(X_val_scaled)

        elif dim == 'none':
            X_latent = X_scaled
            X_val_latent = X_val_scaled

        else:
            print('dimensionality reduction error')

        # clustering
        results_cluster = []
        for n in n_clusters:

            if method == 'kmeans':
                seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
                kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10, 
                random_state=seed_kmeans, tol=0.0001, verbose=0)
                results_opti = kmeansModel.fit_predict(X_latent)
                results_val = kmeansModel.predict(X_val_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                df_val.loc[:,'cluster'] = results_val.tolist()
                score_clust = pres_recommendation_perf(df_train = df_opti, df_test = df_val, n_clusters = n)

            elif method == 'hierarchical':
                h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='average')
                results_opti = h_clust.fit_predict(X_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                for i in df_val.index:
                    dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
                    df_opti['dist']=dist
                    target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
                    df_val.loc[i,'cluster'] = target_cluster
                df_val['cluster'] = df_val['cluster'].apply(np.int64)
                score_clust = pres_recommendation_perf(df_train = df_opti, df_test = df_val, n_clusters = n)
                
            else:
                print('clustering model error')

            results_cluster.append(score_clust)
        
        df_results['split_'+str(k)] = results_cluster
        print(k)
        k = k+1
        
    results = df_results.mean(axis=1)
    
    return results
                
#--------------------------------------------------------------------------------------------------------

# get the mean recall at position values for each combination of dimensionality reduction and clustering methods

df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')

df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')

df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
    
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
    
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')
    
df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
    
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_Recall_AE_hierarchical_first_vitals.csv')

# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()

axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Recall at position 10', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigRecall_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Cluster number optimisation (k-means and hierarchical clustering): score proportion

Values for hierarchical clustering are dysplayed for comparison only - a different cluster optimisation method was used in practice

In [ ]:
start=datetime.datetime.now()

# custom score for cluster number optimisation using the custom proportion score. Should be maximised.
def pres_recommendation_score_prop(df_train, df_test, n_clusters):
    cluster_prescriptions_top_10_prop = {}
    
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
        pres_list =[]
        
        for x in target:
            pres_list.extend(x)
            
        number_pres_tot = len(pres_list)
        df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
        df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
        df_cluster_pres.columns = ['prevalence']
        df_cluster_pres['proportion'] = df_cluster_pres['prevalence']/number_pres_tot
        df_cluster_pres['%_hadm_receiving'] = df_cluster_pres['prevalence']/len(df_train[df_train['cluster'] == c].index)
        df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
        cluster_top_10_prop = df_cluster_pres[df_cluster_pres['rank'] <= 10]
        cluster_prescriptions_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
    
    score = 0
    
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].index.to_list()
        score_ = 0
        number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
        
        for drug in df_test.loc[i, 'new_prescriptions_grouped']:
            if drug in top_10_cluster:
                score_drug = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of prescriptions
        score_i = score_/number_prescription
        score += score_i
        
    final_score = score/len(df_test.index)
    return final_score

#dim_reduction = ['none', 'PCA', 'AE']
#clustering_method = ['kmeans', 'hierarchical']

# define the range of cluster number, latent dimensions (autoencoder), and variances explained by the principal components to include in the optimisation
number_clusters = range(2,31)
latent_dimension = range(int(len(variables)/5), len(variables), int(len(variables)/5))
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]

np.random.seed(1301)
random.set_seed(2206)

def f_cluster_opti(variables, dim, method, n_components, n_latdim, n_clusters, df_train):
    # select optimal autoencoder architecture according to the input features set (see autoencoder parameter tuning)
    n_neurons = 64
    n_layers = 2    
    score_method = []

    # kfold crossvalidation
    X_gss = df_train
    y_gss = df_train.number_prescriptions_groups # only to allow function to run
    group = df_train.subject_id # prevent data leakage from same patient with several hospital admission
    group_kfold = GroupKFold(n_splits=8) # equivalent to a 70:10:20 split overall
    group_kfold.get_n_splits()
    
    df_results = pd.DataFrame(index = n_clusters)
    k = 1

    for opti_index, val_index in group_kfold.split(X_gss, y_gss, groups = group):
        df_opti = X_gss.iloc[opti_index].copy()
        df_val = X_gss.iloc[val_index].copy()
        df_opti.reset_index(inplace=True, drop=True)
        df_val.reset_index(inplace=True, drop=True)
        X = df_opti[variables]
        X_val = df_val[variables]

        # standardisation
        X_unscaled = X.to_numpy()
        X_scaled = StandardScaler().fit_transform(X_unscaled)
        X_val_unscaled = X_val.to_numpy()
        X_val_scaled = StandardScaler().fit_transform(X_val_unscaled)

        # dimensionality reduction
        if dim == 'PCA':
            pca = PCA(n_components=n_components, svd_solver='full')
            X_latent = pca.fit_transform(X_scaled)
            X_val_latent = pca.transform(X_val_scaled)

        elif dim == 'AE':
            dimension = len(variables)
            batch = 32

            # define encoder architecture
            input_data = keras.Input(shape=(dimension,))
            encoded = layers.Dense(n_neurons, activation='relu')(input_data)
            for encoder_layer in range(1, n_layers): 
                encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
                
            # latent space
            encoded = layers.Dense(n_latdim, activation='relu')(encoded)

            # define decoder architecture
            decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
            for decoder_layer in range(1, n_layers): 
                decoded = layers.Dense(n_neurons, activation='relu')(decoded)
            decoded = layers.Dense(dimension)(decoded)

            autoencoder = keras.Model(input_data, decoded)
            encoder = keras.Model(input_data, encoded)
            autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
            
            # fit autoencoder
            autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
            X_latent = encoder.predict(X_scaled)
            X_val_latent = encoder.predict(X_val_scaled)

        elif dim == 'none':
            X_latent = X_scaled
            X_val_latent = X_val_scaled

        else:
            print('dimensionality reduction error')

        # clustering
        results_cluster = []
        for n in n_clusters:

            if method == 'kmeans':
                seed_kmeans = np.random.randint(1,1001) # increase cluster search diversity
                kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n, n_init=10, 
                random_state=seed_kmeans, tol=0.0001, verbose=0)
                results_opti = kmeansModel.fit_predict(X_latent)
                results_val = kmeansModel.predict(X_val_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                df_val.loc[:,'cluster'] = results_val.tolist()
                score_clust = pres_recommendation_score_prop(df_train = df_opti, df_test = df_val, n_clusters = n)

            elif method == 'hierarchical':
                h_clust = AgglomerativeClustering(n_clusters = n, affinity = 'euclidean', linkage ='ward')
                results_opti = h_clust.fit_predict(X_latent)
                df_opti.loc[:,'cluster'] = results_opti.tolist()
                for i in df_val.index:
                    dist = np.square(X_latent - X_val_latent[i]).sum(axis=1)
                    df_opti['dist']=dist
                    target_cluster = np.argmin(df_opti.groupby(['cluster'])['dist'].mean())
                    df_val.loc[i,'cluster'] = target_cluster
                df_val['cluster'] = df_val['cluster'].apply(np.int64)
                score_clust = pres_recommendation_score_prop(df_train = df_opti, df_test = df_val, n_clusters = n)   
                
            else:
                print('clustering model error')

            results_cluster.append(score_clust)
        
        df_results['split_'+str(k)] = results_cluster
        print(k)
        k = k+1
        
    results = df_results.mean(axis=1)
    
    return results
    
#--------------------------------------------------------------------------------------------------------

# get the mean proportion score values for each combination of dimensionality reduction and clustering methods

df_opti_none_kmeans = pd.DataFrame(index = number_clusters, columns = ['none_kmeans'])
new_col = f_cluster_opti(variables, dim='none', method = 'kmeans', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_kmeans['none_kmeans'] = new_col
print('df_opti_none_kmeans done')

df_opti_none_hierarchical = pd.DataFrame(index = number_clusters, columns = ['none_hierarchical'])
new_col = f_cluster_opti(variables, dim='none', method = 'hierarchical', n_components = 'not needed', n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
df_opti_none_hierarchical['none_hierarchical'] = new_col
print('df_opti_none_hierarchical done')

df_opti_PCA_kmeans = pd.DataFrame(index = number_clusters, columns = ['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'kmeans', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_kmeans['PCA_kmeans_'+str(j)] = new_col
print('df_opti_PCA_kmeans done')
    
df_opti_PCA_hierarchical = pd.DataFrame(index = number_clusters, columns = ['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4'])
for exvar, j in zip(n_components, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='PCA', method = 'hierarchical', n_components = exvar, n_latdim = 'not needed', n_clusters = number_clusters, df_train = df_train)
    df_opti_PCA_hierarchical['PCA_hierarchical_'+str(j)] = new_col
print('df_opti_PCA_hierarchical done')
    
df_opti_AE_kmeans = pd.DataFrame(index = number_clusters, columns = ['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'kmeans', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_kmeans['AE_kmeans_'+str(j)] = new_col
print('df_opti_AE_kmeans done')
    
df_opti_AE_hierarchical = pd.DataFrame(index = number_clusters, columns = ['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4'])
for latdim, j in zip(latent_dimension, [0, 1, 2, 3, 4]):
    new_col = f_cluster_opti(variables, dim='AE', method = 'hierarchical', n_components = 'not needed', n_latdim = latdim, n_clusters = number_clusters, df_train = df_train)
    df_opti_AE_hierarchical['AE_hierarchical_'+str(j)] = new_col
print('df_opti_AE_hierarchical done')
    
# save results for reuse in graph production if needed
df_opti_none_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_none_kmeans_first_vitals.csv')
df_opti_none_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_none_hierarchical_first_vitals.csv')
df_opti_PCA_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_kmeans_first_vitals.csv')
df_opti_PCA_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_hierarchical_first_vitals.csv')
df_opti_AE_kmeans.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_kmeans_first_vitals.csv')
df_opti_AE_hierarchical.to_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_hierarchical_first_vitals.csv')

# create data visualisation graphs
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15,20))
fig.subplots_adjust(hspace=0.4)
axs=axs.ravel()

axs[0].plot(df_opti_none_kmeans['none_kmeans'])
axs[0].set_title('k-means without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
#axs[0].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[1].plot(df_opti_none_hierarchical['none_hierarchical'])
axs[1].set_title('hierarchical clustering without \ndimensionality reduction', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
#axs[1].legend(['basic', 'order', 'penalty'], loc = 'upper left')

axs[2].plot(df_opti_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[2].set_title('PCA + k-means', fontsize = 16, pad = 15)
axs[2].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[2].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[2].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[3].plot(df_opti_PCA_hierarchical[['PCA_hierarchical_0', 'PCA_hierarchical_1', 'PCA_hierarchical_2', 'PCA_hierarchical_3', 'PCA_hierarchical_4']])
axs[3].set_title('PCA + hierarchical clustering', fontsize = 16, pad = 15)
axs[3].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[3].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[3].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'upper left')

axs[4].plot(df_opti_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[4].set_title('Autoencoder + k-means', fontsize = 16, pad = 15)
axs[4].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[4].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[4].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

axs[5].plot(df_opti_AE_hierarchical[['AE_hierarchical_0', 'AE_hierarchical_1', 'AE_hierarchical_2', 'AE_hierarchical_3', 'AE_hierarchical_4']])
axs[5].set_title('Autoencoder + hierarchical clustering', fontsize = 16, pad = 15)
axs[5].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[5].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[5].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'upper left')

plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/SuppFigScoreProp_first_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Performance baseline: random prescription recommendation

In [ ]:
# recall at position 10
def random_performance(df_train, df_test):
    np.random.seed(2206)
    unique_set = set() 
    
    # create a set of all prescription groups
    for i in df_train.index:
        index_pres = df_train.new_prescriptions_grouped[i]
        new_pres = set(index_pres) - unique_set
        unique_set.update(new_pres)

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
    
    overall_perf_ = 0
    
    for i in df_test.index:
        # pick a random sample of 10 prescriptions without replacement
        random_pick = np.random.choice(list(unique_set), 10, replace=False)
        # calculate the recall at position 10 value
        prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
        recommended_pres = intersection(prescriptions, random_pick)
        performance = len(recommended_pres)/len(prescriptions)
        overall_perf_ += performance

    overall_perf = overall_perf_/len(df_test.index)
    return overall_perf

# score proportion
def random_performance_prop(df_train, df_test):
    np.random.seed(2206)
    unique_set = set() 
    
    create a set of all prescription groups
    for i in df_train.index:
        index_pres = df_train.new_prescriptions_grouped[i]
        new_pres = set(index_pres) - unique_set
        unique_set.update(new_pres)
    
    # gets proportion of prescriptions in train set
    target=df_train['new_prescriptions_grouped']
    pres_list =[]
    for x in target:
        pres_list.extend(x)     
    number_pres_tot = len(pres_list)
    df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
    df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
    df_pres.columns = ['prevalence']
    df_pres['proportion'] = df_pres['prevalence']/number_pres_tot
    df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(df_train.index)
    
    score = 0
    for i in df_test.index:
        # pick a random sample of 10 prescriptions without replacement
        random_pick = np.random.choice(list(unique_set), 10, replace=False)
        # calculate the proportion score
        score_ = 0
        number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
        for drug in df_test.loc[i, 'new_prescriptions_grouped']:
            if drug in random_pick:
                score_drug = df_pres.loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of prescriptions
        score_i = score_/number_prescription
        score += score_i

    final_score = score/len(df_test.index)
    return final_score

print('Performance of random selected list of 10 prescriptions: ' +str(random_performance(df_train, df_test)))
print('Performance of random selected list of 10 prescriptions with proportion: ' +str(random_performance_prop(df_train, df_test)))

Performance baseline: random procedure recommendation

In [ ]:
# recall at position 10
def random_performance(df_train, df_test):
    np.random.seed(2206)
    unique_set = set()
    
    # create a set of all procedure groups
    for i in df_train.index:
        index_proc = df_train.new_procedures_grouped[i]
        new_proc = set(index_proc) - unique_set
        unique_set.update(new_proc)

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
    
    overall_perf_ = 0
    for i in df_test.index:
        # pick a random sample of 10 procedures without replacement
        random_pick = np.random.choice(list(unique_set), 10, replace=False)
        # calculate the recall at position 10 value
        procedures = df_test.loc[i, 'new_procedures_grouped']
        recommended_proc = intersection(procedures, random_pick)
        performance = len(recommended_proc)/len(procedures)
        overall_perf_ += performance

    overall_perf = overall_perf_/len(df_test.index)
    return overall_perf

# score proportion
def random_performance_prop(df_train, df_test):
    np.random.seed(2206)
    unique_set = set() 
    
    # create a set of all procedure groups
    for i in df_train.index:
        index_proc = df_train.new_procedures_grouped[i]
        new_proc = set(index_proc) - unique_set
        unique_set.update(new_proc)
    
    # gets proportion of procedures in train set
    target=df_train['new_procedures_grouped']
    proc_list =[]
    for x in target:
        proc_list.extend(x)
    number_proc_tot = len(proc_list)
    df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
    df_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
    df_proc.columns = ['prevalence']
    df_proc['proportion'] = df_proc['prevalence']/number_proc_tot
    df_proc['%_hadm_receiving'] = df_proc['prevalence']/len(df_train.index)
    
    score = 0
    for i in df_test.index:
        # pick a random sample of 10 procedures without replacement
        random_pick = np.random.choice(list(unique_set), 10, replace=False)
        # calculate the proportion score
        score_ = 0
        number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
        for drug in df_test.loc[i, 'new_procedures_grouped']:
            if drug in random_pick:
                score_drug = df_proc.loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of procedures
        score_i = score_/number_procedure
        score += score_i

    final_score = score/len(df_test.index)
    return final_score

print('Performance of random selected list of 10 procedures: ' +str(random_performance(df_train, df_test)))
print('Performance of random selected list of 10 procedures with proportion: ' +str(random_performance_prop(df_train, df_test)))

Performance baseline: top 10 prescriptions

In [ ]:
# recall at position 10
def top10_performance(df_train, df_test):
    #creates a list of all prescribed drug groups
    target=df_train['new_prescriptions_grouped']
    pres_list =[]
    for x in target:
        pres_list.extend(x)
        
    # identify the top 10 most prescribed drug groups    
    top_10 = [pres for pres, cnt in Counter(pres_list).most_common(10)]
    print(top_10)

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))

    # calculate the recall at position 10 value
    overall_perf_ = []
    for i in df_test.index:
        prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
        recommended_pres = intersection(prescriptions, top_10)
        performance = len(recommended_pres)/len(prescriptions)
        overall_perf_.append(performance)
    overall_perf = np.mean(overall_perf_)
    
    return overall_perf

# score proportion
def top10_performance_prop(df_train, df_test):
    #creates a list of all precribed drug groups
    target=df_train['new_prescriptions_grouped']
    pres_list =[]
    for x in target:
        pres_list.extend(x)
        
    # identify the top 10 most prescribed drug groups and the proportion of admissions receiving them
    number_pres_tot = len(pres_list)
    df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
    df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
    df_pres.columns = ['prevalence']
    df_pres['proportion'] = df_pres['prevalence']/number_pres_tot
    df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(df_train.index)
    df_pres['rank'] = df_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
    top_10 = df_pres[df_pres['rank'] <= 10].index.to_list()
    print(df_pres[df_pres['rank'] <= 10])
    
    # calculate the proportion score
    score = 0
    for i in df_test.index:
        score_ = 0
        number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
        for drug in df_test.loc[i, 'new_prescriptions_grouped']:
            if drug in top_10:
                score_drug = df_pres.loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of prescriptions
        score_i = score_/number_prescription
        score += score_i

    final_score = score/len(df_test.index)
    return final_score

print('Performance of most prescribed list of 10 prescriptions: ' +str(top10_performance(df_train, df_test)))
print('Performance of most prescribed list of 10 prescriptions with proportion: ' +str(top10_performance_prop(df_train, df_test)))

Performance baseline: top 10 procedures

In [ ]:
# recall at position 10
def top10_performance(df_train, df_test):
    #creates a list of all prescribed procedure groups
    target=df_train['new_procedures_grouped']
    proc_list =[]
    for x in target:
        proc_list.extend(x)
        
    # identify the top 10 most prescribed procedure groups    
    top_10 = [proc for proc, cnt in Counter(proc_list).most_common(10)]
    print(top_10)

    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
    
    # calculate the recall at position 10 value
    overall_perf_ = []
    for i in df_test.index:
        procedures = df_test.loc[i, 'new_procedures_grouped']
        recommended_proc = intersection(procedures, top_10)
        performance = len(recommended_proc)/len(procedures)
        overall_perf_.append(performance)
    overall_perf = np.mean(overall_perf_)
    
    return overall_perf

# score proportion
def top10_performance_prop(df_train, df_test):  
    #creates a list of all precribed drug groups
    target=df_train['new_procedures_grouped']
    proc_list =[]
    for x in target:
        proc_list.extend(x)
        
    # identify the top 10 most prescribed procedure groups and the proportion of admissions receiving them
    number_proc_tot = len(proc_list)
    df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
    df_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
    df_proc.columns = ['prevalence']
    df_proc['proportion'] = df_proc['prevalence']/number_proc_tot
    df_proc['%_hadm_receiving'] = df_proc['prevalence']/len(df_train.index)
    df_proc['rank'] = df_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
    top_10 = df_proc[df_proc['rank'] <= 10].index.to_list()
    print(df_proc[df_proc['rank'] <= 10])
    
    # calculate the proportion score
    score = 0
    for i in df_test.index:
        score_ = 0
        number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
        for drug in df_test.loc[i, 'new_procedures_grouped']:
            if drug in top_10:
                score_drug = df_proc.loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of procedures
        score_i = score_/number_procedure
        score += score_i

    final_score = score/len(df_test.index)
    return final_score

print('Performance of most prescribed list of 10 procedures: ' +str(top10_performance(df_train, df_test)))
print('Performance of most prescribed list of 10 procedures with proportion: ' +str(top10_performance_prop(df_train, df_test)))

Task 1: clustering models performance analysis (prescriptions)

In [ ]:
# recall at position 10
def pres_recommendation_perf(df_train, df_test, n_clusters):
    # for each cluster, get the top 10 prescriptions with the proportion of admissions in the cluster receiving them
    cluster_prescriptions_top_10 = {}
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
        pres_list =[]
        for x in target:
            pres_list.extend(x)    
        df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
        df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
        df_cluster_pres.columns = ['prevalence']
        df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
        cluster_top_10 = df_cluster_pres[df_cluster_pres['rank'] <= 10].index.to_list()
        cluster_prescriptions_top_10["cluster_" + str(c)] = cluster_top_10
    
    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
 
    # calculate the recall at position 10 value (test patient's prescriptions compared to the top 10 prescriptions in the most similar cluster)
    overall_perf_ = 0
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_prescriptions_top_10["cluster_" + str(cluster)]   
        prescriptions = df_test.loc[i, 'new_prescriptions_grouped']
        recommended_pres = intersection(prescriptions, top_10_cluster)
        performance = len(recommended_pres)/len(prescriptions)      
        overall_perf_ += performance
    
    overall_perf = overall_perf_/len(df_test.index)
    return overall_perf
 
# proportion score
def pres_recommendation_score_prop(df_train, df_test, n_clusters):
    # for each cluster, get the top 10 prescriptions with the proportion of admissions in the cluster receiving them
    cluster_prescriptions_top_10_prop = {}
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_prescriptions_grouped']
        pres_list =[]
        for x in target:
            pres_list.extend(x)
        number_pres_tot = len(pres_list)
        df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
        df_cluster_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
        df_cluster_pres.columns = ['prevalence']
        df_cluster_pres['proportion'] = df_cluster_pres['prevalence']/number_pres_tot
        df_cluster_pres['%_hadm_receiving'] = df_cluster_pres['prevalence']/len(df_train[df_train['cluster'] == c].index)
        df_cluster_pres['rank'] = df_cluster_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
        cluster_top_10_prop = df_cluster_pres[df_cluster_pres['rank'] <= 10]
        cluster_prescriptions_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
    
    # calculate the proportion score (test patient's prescriptions compared to the top 10 prescriptions in the most similar cluster)
    score = 0
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].index.to_list()
        score_ = 0
        number_prescription = len(df_test.loc[i, 'new_prescriptions_grouped'])
        for drug in df_test.loc[i, 'new_prescriptions_grouped']:
            if drug in top_10_cluster:
                score_drug = cluster_prescriptions_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of prescriptions
        score_i = score_/number_prescription
        score += score_i
        
    final_score = score/len(df_test.index)
    return final_score
    
#--------------------------------------------------------------------------------------------------------------------

# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
             'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
             'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta', 
             'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta', 
             'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta', 
             'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta', 
             'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']

# (paste autoencoder parameters according to input features set selected)
n_neurons = 128
n_layers = 2
latent_dimension = 42

n_components = 0.8 # number of components selected to explain > 80% of the variance

# select optimal cluster number for each combination according to the cluster number optimisation results
n_cluster_PCA_kmeans = 6
n_cluster_AE_kmeans = 4
n_cluster_PCA_hierarchical = 3
n_cluster_AE_hierarchical = 4

# set random seed
np.random.seed(1301)
random.set_seed(2206)

# data preparation
X = df_train[variables]
X_test = df_test[variables]
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_test_unscaled = X_test.to_numpy()
X_test_scaled = StandardScaler().fit_transform(X_test_unscaled)

#--------------------------------------------------------------------------------------------------------------------

# PCA + k-means

# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
print('number of components: ' + str(pca.n_components_))

# define the k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_PCA_kmeans, n_init=10, 
random_state=2206, tol=0.0001, verbose=0)

# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()

# assess performance                
perf_PCA_kmeans = pres_recommendation_perf(df_train, df_test, n_cluster_PCA_kmeans)
score_prop_PCA_kmeans = pres_recommendation_score_prop(df_train, df_test, n_cluster_PCA_kmeans)
print('++++++++++++++++++')
print('Recall PCA + k-means: ' + str(perf_PCA_kmeans))
print('Score prop PCA + k-means: ' + str(score_prop_PCA_kmeans))

#--------------------------------------------------------------------------------------------------------------------

# PCA + hierarchical clustering

# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)
print('number of components: ' + str(pca.n_components_))

# define the hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_PCA_hierarchical, affinity = 'euclidean', linkage ='ward')

# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()

# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
    ave_cluster_dist = []
    for cluster in range(0, n_cluster_PCA_hierarchical):
        average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
        ave_cluster_dist.append(average_dist)   
    target_cluster = np.argmin(ave_cluster_dist)
    df_test.loc[i,'cluster'] = target_cluster

# assess performance
perf_PCA_hierarchical = pres_recommendation_perf(df_train, df_test, n_cluster_PCA_hierarchical)
score_prop_PCA_hierarchical = pres_recommendation_score_prop(df_train, df_test, n_cluster_PCA_hierarchical)
print('++++++++++++++++++')
print('Recall PCA + hierarchical: ' + str(perf_PCA_hierarchical))
print('Score prop PCA + hierarchical: ' + str(score_prop_PCA_hierarchical))

#--------------------------------------------------------------------------------------------------------------------

# AE + k-means

# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)

input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): 
    encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
    
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)

decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
for decoder_layer in range(1, n_layers): 
    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)

autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=200, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)

# define the k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_AE_kmeans, n_init=10, 
random_state=2206, tol=0.0001, verbose=0)

# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()

# assess performance
perf_AE_kmeans = pres_recommendation_perf(df_train, df_test, n_cluster_AE_kmeans)
score_prop_AE_kmeans = pres_recommendation_score_prop(df_train, df_test, n_cluster_AE_kmeans)
print('++++++++++++++++++')
print('Recall AE + k-means: ' + str(perf_AE_kmeans))
print('Score prop AE + k-means: ' + str(score_prop_AE_kmeans))

#--------------------------------------------------------------------------------------------------------------------

# AE + hierarchical clustering

# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)

input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): 
    encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
    
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)

decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
for decoder_layer in range(1, n_layers): 
    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)

autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)

# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_AE_hierarchical, affinity = 'euclidean', linkage ='ward')

# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()

# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
    ave_cluster_dist = []
    for cluster in range(0, n_cluster_AE_hierarchical):
        average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
        ave_cluster_dist.append(average_dist)   
    target_cluster = np.argmin(ave_cluster_dist)
    df_test.loc[i,'cluster'] = target_cluster

# assess performance
perf_AE_hierarchical = pres_recommendation_perf(df_train, df_test, n_cluster_AE_hierarchical)
score_prop_AE_hierarchical = pres_recommendation_score_prop(df_train, df_test, n_cluster_AE_hierarchical)
print('++++++++++++++++++')
print('Performance AE + hierarchical: ' + str(perf_AE_hierarchical))
print('Score prop AE + hierarchical: ' + str(score_prop_AE_hierarchical))

# dysplay the top ten prescriptions in each of the n clusters given by the best performing model combination.
# calculate in each of these same clusters the proprtion of the admissions which will experience a critical event within 24h of the contact point.
print('++++++++++++++++++')
print('best clusters') # in this case, 4 clusters produced by a combination of autoencoder and hierarchical clustering
for n in range(0, n_cluster_AE_hierarchical):
    cluster_ = df_train[df_train['cluster']==n]['new_prescriptions_grouped']
    print('size cluster: ' + str(len(cluster_)))
    
    risk = len(df_train[(df_train['cluster']==n) & (df_train['event']==1)])/len(df_train[df_train['cluster']==n])
    print('risk: '+ str(risk))
    
    pres_list =[]
    for x in cluster_:
        pres_list.extend(x)
    df_new_drugs = pd.DataFrame(pres_list,columns=['new_prescriptions_grouped'])
    df_pres = df_new_drugs['new_prescriptions_grouped'].value_counts().to_frame()
    df_pres.columns = ['prevalence']
    df_pres['%_hadm_receiving'] = df_pres['prevalence']/len(cluster_.index)
    df_pres['rank'] = df_pres['prevalence'].rank(ascending=False) # dataframe with prescriptions as index and rank as column
    print(df_pres[df_pres['rank'] <= 10])

Task 2: clustering models performance analysis (procedures)

In [ ]:
# recall at position 10
def proc_recommendation_perf(df_train, df_test, n_clusters):
    # for each cluster, get the top 10 procedures with the proportion of admissions in the cluster receiving them
    cluster_procedures_top_10 = {}
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_procedures_grouped']
        proc_list =[]
        for x in target:
            proc_list.extend(x)    
        df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
        df_cluster_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
        df_cluster_proc.columns = ['prevalence']
        df_cluster_proc['rank'] = df_cluster_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
        cluster_top_10 = df_cluster_proc[df_cluster_proc['rank'] <= 10].index.to_list()
        cluster_procedures_top_10["cluster_" + str(c)] = cluster_top_10
    
    def intersection(lst1, lst2):
        return list(set(lst1) & set(lst2))
 
    # calculate the recall at position 10 value (test patient's procedures compared to the top 10 procedures in the most similar cluster)
    overall_perf_ = 0
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_procedures_top_10["cluster_" + str(cluster)]   
        procedures = df_test.loc[i, 'new_procedures_grouped']
        recommended_proc = intersection(procedures, top_10_cluster)
        performance = len(recommended_proc)/len(procedures)      
        overall_perf_ += performance
    
    overall_perf = overall_perf_/len(df_test.index)
    return overall_perf

# score proportion
def proc_recommendation_score_prop(df_train, df_test, n_clusters):
    # for each cluster, get the top 10 procedures with the proportion of admissions in the cluster receiving them
    cluster_procedures_top_10_prop = {}
    for c in range(0, n_clusters):
        target=df_train[df_train['cluster'] == c]['new_procedures_grouped']
        proc_list =[]
        for x in target:
            proc_list.extend(x)
        number_proc_tot = len(proc_list)
        df_new_drugs = pd.DataFrame(proc_list,columns=['new_procedures_grouped'])
        df_cluster_proc = df_new_drugs['new_procedures_grouped'].value_counts().to_frame()
        df_cluster_proc.columns = ['prevalence']
        df_cluster_proc['proportion'] = df_cluster_proc['prevalence']/number_proc_tot
        df_cluster_proc['%_hadm_receiving'] = df_cluster_proc['prevalence']/len(df_train[df_train['cluster'] == c].index)
        df_cluster_proc['rank'] = df_cluster_proc['prevalence'].rank(ascending=False) # dataframe with procedures as index and rank as column
        cluster_top_10_prop = df_cluster_proc[df_cluster_proc['rank'] <= 10]
        cluster_procedures_top_10_prop["cluster_" + str(c)] = cluster_top_10_prop
    
    # calculate the proportion score (test patient's procedures compared to the top 10 procedures in the most similar cluster)
    score = 0
    for i in df_test.index:
        cluster = df_test.loc[i, 'cluster']
        top_10_cluster = cluster_procedures_top_10_prop["cluster_" + str(cluster)].index.to_list()
        score_ = 0
        number_procedure = len(df_test.loc[i, 'new_procedures_grouped'])
        for drug in df_test.loc[i, 'new_procedures_grouped']:
            if drug in top_10_cluster:
                score_drug = cluster_procedures_top_10_prop["cluster_" + str(cluster)].loc[drug, '%_hadm_receiving']
            else:
                score_drug = 0
            score_ += score_drug

        # average score in order to compare patients with different number of procedures
        score_i = score_/number_procedure
        score += score_i
        
    final_score = score/len(df_test.index)
    return final_score

#--------------------------------------------------------------------------------------------------------------------

# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
             'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
             'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta', 
             'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta', 
             'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta', 
             'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta', 
             'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']

# (paste autoencoder parameters according to input features set selected)
n_neurons = 128
n_layers = 2
latent_dimension = 42

n_components = 0.8 # number of components selected to explain > 80% of the variance

# select optimal cluster number for each combination according to the cluster number optimisation results
n_cluster_PCA_kmeans = 6
n_cluster_AE_kmeans = 4
n_cluster_PCA_hierarchical = 3
n_cluster_AE_hierarchical = 4

# set random seed
np.random.seed(1301)
random.set_seed(2206)

# data preparation
X = df_train[variables]
X_test = df_test[variables]
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)
X_test_unscaled = X_test.to_numpy()
X_test_scaled = StandardScaler().fit_transform(X_test_unscaled)

#--------------------------------------------------------------------------------------------------------------------

# PCA + k-means

# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)

# define k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_PCA_kmeans, n_init=10, 
random_state=2206, tol=0.0001, verbose=0)

# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()

# assess performance
perf_PCA_kmeans = proc_recommendation_perf(df_train, df_test, n_cluster_PCA_kmeans)
score_prop_PCA_kmeans = proc_recommendation_score_prop(df_train, df_test, n_cluster_PCA_kmeans)
print('++++++++++++++++++')
print('Performance PCA + k-means: ' + str(perf_PCA_kmeans))
print('Score prop PCA + k-means: ' + str(score_prop_PCA_kmeans))

#--------------------------------------------------------------------------------------------------------------------
# PCA + hierarchical clustering

# dimensionality reduction
pca = PCA(n_components=n_components, svd_solver='full')
X_latent = pca.fit_transform(X_scaled)
X_test_latent = pca.transform(X_test_scaled)

# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_PCA_hierarchical, affinity = 'euclidean', linkage ='ward')

# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()

# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
    ave_cluster_dist = []
    for cluster in range(0, n_cluster_PCA_hierarchical):
        average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
        ave_cluster_dist.append(average_dist)   
    target_cluster = np.argmin(ave_cluster_dist)
    df_test.loc[i,'cluster'] = target_cluster

# assess performance
perf_PCA_hierarchical = proc_recommendation_perf(df_train, df_test, n_cluster_PCA_hierarchical)
score_prop_PCA_hierarchical = proc_recommendation_score_prop(df_train, df_test, n_cluster_PCA_hierarchical)
print('++++++++++++++++++')
print('Performance PCA + hierarchical: ' + str(perf_PCA_hierarchical))
print('Score prop PCA + hierarchical: ' + str(score_prop_PCA_hierarchical))

#--------------------------------------------------------------------------------------------------------------------

# AE + k-means

# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)

input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): 
    encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
    
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)

decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
for decoder_layer in range(1, n_layers): 
    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)

autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=200, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)

# define k-means model
kmeansModel = KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100, n_clusters=n_cluster_AE_kmeans, n_init=10, 
random_state=2206, tol=0.0001, verbose=0)

# fit the model and asign contact points (in both the train and test sets) to clusters
results_train = kmeansModel.fit_predict(X_latent)
results_test = kmeansModel.predict(X_test_latent)
df_train.loc[:,'cluster'] = results_train.tolist()
df_test.loc[:,'cluster'] = results_test.tolist()

# assess performance
perf_AE_kmeans = proc_recommendation_perf(df_train, df_test, n_cluster_AE_kmeans)
score_prop_AE_kmeans = proc_recommendation_score_prop(df_train, df_test, n_cluster_AE_kmeans)
print('++++++++++++++++++')
print('Performance AE + k-means: ' + str(perf_AE_kmeans))
print('Score prop AE + k-means: ' + str(score_prop_AE_kmeans))

#--------------------------------------------------------------------------------------------------------------------
# AE + hierarchical clustering

# dimensionality reduction
dimension = len(variables)
batch = int(len(X.index)/2)

input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): 
    encoded = layers.Dense(n_neurons, activation='relu')(encoded)  
    
encoded = layers.Dense(latent_dimension, activation='relu')(encoded)

decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
for decoder_layer in range(1, n_layers): 
    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)

autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_latent = encoder.predict(X_scaled)
X_test_latent = encoder.predict(X_test_scaled)

# define hierarchical clustering model
h_clust = AgglomerativeClustering(n_clusters = n_cluster_AE_hierarchical, affinity = 'euclidean', linkage ='ward')

# fit the model and asign contact points of the training set to clusters
results_train = h_clust.fit_predict(X_latent)
df_train.loc[:,'cluster'] = results_train.tolist()

# attribute each contact point of the test set to the closest cluster using the average method
for i in df_test.index:
    ave_cluster_dist = []
    for cluster in range(0, n_cluster_AE_hierarchical):
        average_dist = np.mean(np.square(X_latent[df_train[df_train['cluster']==cluster].index] - X_test_latent[i]).sum(axis=1))
        ave_cluster_dist.append(average_dist)   
    target_cluster = np.argmin(ave_cluster_dist)
    df_test.loc[i,'cluster'] = target_cluster

# assess performance
perf_AE_hierarchical = proc_recommendation_perf(df_train, df_test, n_cluster_AE_hierarchical)
score_prop_AE_hierarchical = proc_recommendation_score_prop(df_train, df_test, n_cluster_AE_hierarchical)
print('++++++++++++++++++')
print('Performance AE + hierarchical: ' + str(perf_AE_hierarchical))
print('Score prop AE + hierarchical: ' + str(score_prop_AE_hierarchical))

Figures production

Requires df_clustering_ready and df_raw

Figure 4

dataset characteristics

In [ ]:
# select postoperative contact points on the ward (including recovery time)
df_fig4 = df_graphs_all[df_graphs_all['status'].isin([2, 6])]

# plot figure 4
pdf = plt.figure(figsize=(12,17))
plt.suptitle(t = pop.description, fontsize = 20, y = 0.93)

Fig1 = plt.subplot(321)
df_fig4[df_fig4.alarm == 1]['score_total'].plot(kind='hist', bins = np.arange(2.5, 15.5, 1), color = 'darkorange')
plt.axvline(x=6.5, color='black', linestyle='-')
Fig1.set_title('Distribution of NEWS values', fontsize = 14, pad = 10)
Fig1.set_xlabel('NEWS values', fontsize = 12)
Fig1.set_ylabel('Frequency', fontsize = 12)

Fig2 = plt.subplot(322)
df_fig4[df_fig4.alarm == 1]['hadm_id'].value_counts().plot(kind='hist', bins=np.arange(0.5, 101.5, 1))
Fig2.set_title('Number of triggers per admission', fontsize = 14, pad = 10)
Fig2.set_xlabel('Number of triggers', fontsize = 12)
Fig2.set_ylabel('Frequency', fontsize = 12)

Fig3 = plt.subplot(323)
df_fig4['post_firstOP_time_h'] = df_fig4['post_firstOP_time'] / np.timedelta64(1, 'h')
df_fig4[df_fig4.alarm == 1]['post_firstOP_time_h'].plot(kind='hist', bins=100, range=(0,400), color = 'crimson')
Fig3.set_title('Distribution of NEWS triggers over time', fontsize = 14, pad = 10)
Fig3.set_xlabel('Post operative time (hours)', fontsize = 12)
Fig3.set_ylabel('Frequency', fontsize = 12)

Fig4 = plt.subplot(324)
df_fig4['post_firstOP_time_h'] = df_fig4['post_firstOP_time'] / np.timedelta64(1, 'h')
df_fig4[(df_fig4['alarm'] == 1)].drop_duplicates(subset=['hadm_id'], keep='first')['post_firstOP_time_h'].plot(kind='hist', bins=100, range=(0,200), color = 'darkcyan')
Fig4.set_title('Distribution of first alarm triggers over time', fontsize = 14, pad = 10)
Fig4.set_xlabel('Post operative time (hours)', fontsize = 12)
Fig4.set_ylabel('Frequency', fontsize = 12)

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.2, hspace=0.3)
plt.show()

pdf.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig4_' + pop.name + '.pdf', bbox_inches = 'tight')

Figure 5

description of the intervals between the timepoints of input data collection

In [ ]:
# select observations and lab values during a postoperative stay on the ward (without recovery period)
df_fig5=df_raw[df_raw['status'] == 2].copy()

# select information to display: 'vitals' for only the SEND entries, 'labs' for only the laboratory entries, 'POCT' for only POCT entries, 'all' for all combined
record = 'vitals'
# select the time of the day to consider: 'all_day' for all day, 'period' for only a selected period of time. If period is selected, enter the start and end time
time = 'all_day'
start = '20:00'
end = '07:00'

# select data accordingly
if record == 'vitals':
    df_rec = df_fig5.dropna(subset = ['charttime']).copy(deep=True)
    title1 = 'vital signs'
elif record == 'labs':
    df_rec = df_fig5.dropna(subset = ['labtime']).copy(deep=True)
    title1 = 'labs'
elif record == 'POCT':
    df_rec = df_fig5.dropna(subset = ['collection_time']).copy(deep=True)
    title1 = 'POCT'
elif record == 'all':
    df_rec = df_fig5.copy(deep=True)
    title1 = 'vital signs, labs and POCT'

if time == 'period':
    df_time = df_rec.copy(deep=True)
    df_time = df_time.set_index(pd.DatetimeIndex(df_time['obstime']))
    df_time = df_time.between_time(start, end)
    title2 = '\n (without recording from ' + end + ' to ' + start + ')'
elif time == 'all_day':
    df_time = df_rec.copy(deep=True)
    title2 = ''

# time unit adjustments
df_time['interval_s']=df_time['interval']/datetime.timedelta(seconds=1)
df_time['interval_h']=df_time['interval']/datetime.timedelta(hours=1)
df_time['time_since_hadm_s']=df_time['time_since_hadm']/datetime.timedelta(seconds=1)
df_time['time_since_hadm_h']=df_time['time_since_hadm']/datetime.timedelta(hours=1)

# plot figure 5
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# interval between 2 recordings
df_time['interval_h'].plot(ax=ax1, kind='hist', bins = range(0,36), fontsize = 14) # use range(0,36) for vitals and range(0,72) for labs
ax1.set_title('Distribution of the intervals between 2 ' + title1 + '\nrecording over time', fontsize = 14, pad = 10)
ax1.set_xlabel('interval between two recording (h)', fontsize = 12)
ax1.set_ylabel('number of recording', fontsize = 12)
ax1.tick_params(labelsize=10)
# (for labs graph) ax1.axvline(x=26, color='black', linestyle='--')

# evolution of the interval between 2 recordings according to the patient length of stay (boxplot)
df_time['stay_group'] = pd.cut(df_time['time_since_hadm_h'], bins=range(0, 2501, 120))
df_time.boxplot(ax=ax2, column='interval_h', by='stay_group', grid=False, whis=[5, 95], showfliers=False)
ax2.set_xticklabels([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
ax2.set_title('Evolution of intervals between 2 ' + title1 + '\nrecording over time', fontsize = 14, pad = 10)
ax2.set_xlabel('length of stay (by group of 5 days)', fontsize = 12)
ax2.set_ylabel('interval between 2 recordings (h)', fontsize = 12)
plt.suptitle('')
# (for labs graph) ax2.axhline(y=26, color='black', linestyle='--')

# adapt file name according to data used
fig.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig2_vitals_' + pop.name + '.pdf', bbox_inches = 'tight')

Figure 6

distribution of the number of new prescription within 24 hours according to the occurence of NEWS alarms and critical events following a contact point

In [ ]:
# select postoperative contact points on the ward (without recovery period)
df_fig6 = df_graphs_all[df_graphs_all['status']==2]

# produce a dataframe for each of the combinations of interest
s1 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0)]['number_prescriptions_groups'])
s2 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1)]['number_prescriptions_groups'])
s3 = pd.DataFrame(df_fig6[(df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s4 = pd.DataFrame(df_fig6[(df_fig6['alarm'] == 1)]['number_prescriptions_groups'])
s5 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0) & (df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s6 = pd.DataFrame(df_fig6[(df_fig6['event'] == 0) & (df_fig6['alarm'] == 1)]['number_prescriptions_groups'])
s7 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1) & (df_fig6['alarm'] == 0)]['number_prescriptions_groups'])
s8 = pd.DataFrame(df_fig6[(df_fig6['event'] == 1) & (df_fig6['alarm'] == 1)]['number_prescriptions_groups'])

s1['group'] = '1.No_event'
s2['group'] = '2.Event'
s3['group'] = '3.No_alarm'
s4['group'] = '4.Alarm'
s5['group'] = '5.No_alarm_no_event'
s6['group'] = '6.Alarm_no_event'
s7['group'] = '7.No_alarm_event'
s8['group'] = '8.Alarm_and_event'

# used to adapt legend
print(len(s1.index))
print(len(s2.index))
print(len(s3.index))
print(len(s4.index))
print(len(s5.index))
print(len(s6.index))
print(len(s7.index))
print(len(s8.index))

# plot figure 6
df_plot = pd.concat([s1, s2, s3, s4, s5, s6, s7, s8])
ax=df_plot.boxplot(by='group', grid=False, whis=[5, 95], showfliers=False)
ax.set_xticklabels(['No event\n(187,330)', 'Event\n(4052)', 'No alarm\n(170,370)', 'Alarm\n(21,012)', 'No alarm, no event\n(167,716)', 'Alarm, no event\n(19,614)', 'No alarm, event\n(2654)', 'Alarm and event\n(1398)'], rotation=70)
ax.set_title('Number of new prescriptions within 24h according\nto the alarm and event status', fontsize = 14, pad = 10)
ax.set_xlabel('Presence/abscence of alarm and event', fontsize = 12)
ax.set_ylabel('Number of new prescriptions\nwithin 24h', fontsize = 12)
plt.suptitle('')

plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig6_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Figure 7

cluster number optimisation for the k-means algorithm using the custom proportion score

In [ ]:
# choose optimisation score and input features set of interest
df_res_PCA_kmeans = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_PCA_kmeans_first_all.csv', index_col=[0])
df_res_AE_kmeans = pd.read_csv('C:/Users/baptiste.vasey/Documents/Thesis/Provisory_dataframe/results_cluster_optimisation_ScoreProp_AE_kmeans_first_all.csv', index_col=[0])

# paste according latent dimension and n_components variables
latent_dimension = [21, 42, 63, 84, 105]
n_components = [0.75, 0.8, 0.85, 0.9, 0.95]

# plot figure 7
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14,6))
fig.subplots_adjust(hspace=0.35, wspace=0.25)
axs=axs.ravel()

axs[0].plot(df_res_PCA_kmeans[['PCA_kmeans_0', 'PCA_kmeans_1', 'PCA_kmeans_2', 'PCA_kmeans_3', 'PCA_kmeans_4']])
axs[0].set_title('PCA + kmeans:\nvalidation results', fontsize = 16, pad = 15)
axs[0].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[0].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[0].legend(['exvar: '+str(n_components[0]), 'exvar: '+str(n_components[1]), 'exvar: '+str(n_components[2]), 'exvar: '+str(n_components[3]), 'exvar: '+str(n_components[4])], loc = 'lower right')

axs[1].plot(df_res_AE_kmeans[['AE_kmeans_0', 'AE_kmeans_1', 'AE_kmeans_2', 'AE_kmeans_3', 'AE_kmeans_4']])
axs[1].set_title('Autoencoder + hierarchical clustering:\nvalidation results', fontsize = 16, pad = 15)
axs[1].set_xlabel('Number of clusters', fontsize = 12, labelpad = 10)
axs[1].set_ylabel('Custom proportion score value', fontsize = 12, labelpad = 10)
axs[1].legend(['latdim: '+str(latent_dimension[0]), 'latdim: '+str(latent_dimension[1]), 'latdim: '+str(latent_dimension[2]), 'latdim: '+str(latent_dimension[3]), 'latdim: '+str(latent_dimension[4])], loc = 'lower right')

# adapt to optimisation score and input features set selected
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig7_ScoreProp_all_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()

Figure 8

Cluster number optimisation for the hierarchical clustering algorithm using the dendrogram method

In [ ]:
# select input features set
variables = ['HR', 'RR', 'DBP', 'SBP', 'SPO2', 'TEMP', 'AVPU', 'ESTIMATED_FIO2_2', 'ALB', 'ALP', 'ALT', 'APTT', 'AST', 'BAS', 'BASEX', 'BICAR', 'CAL', 'CL', 'CR', 'CRP', 'COCA', 'EGFR', 'EOS',
             'GLU', 'HCT', 'HGB', 'INR', 'KET', 'LAC', 'LYM', 'MCH', 'MCHC', 'MCV', 'METHB', 'MON', 'MPV', 'NEU', 'OSM', 'PH', 'PLT', 'POT', 'RBC', 'SOD', 'TBIL', 'TROP', 'UR', 'WBC', 'age',
             'cci', 'post_firstOP_time_hours', 'post_lastOP_time_hours', 'ALB_delta', 'ALP_delta', 'ALT_delta', 'APTT_delta', 'AST_delta', 'BAS_delta', 'BASEX_delta', 'BICAR_delta', 'CAL_delta', 
             'CL_delta', 'CR_delta', 'CRP_delta', 'COCA_delta', 'EGFR_delta', 'EOS_delta', 'GLU_delta', 'HCT_delta', 'HGB_delta', 'INR_delta', 'KET_delta', 'LAC_delta', 'LYM_delta', 'MCH_delta', 
             'MCHC_delta', 'MCV_delta', 'METHB_delta', 'MON_delta', 'MPV_delta', 'NEU_delta', 'OSM_delta', 'PH_delta', 'PLT_delta', 'POT_delta', 'RBC_delta', 'SOD_delta', 'TBIL_delta', 
             'TROP_delta', 'UR_delta', 'WBC_delta', 'HR_minmax', 'HR_delta', 'RR_minmax', 'RR_delta', 'DBP_minmax', 'DBP_delta', 'SBP_minmax', 'SBP_delta', 'SPO2_minmax', 'SPO2_delta', 
             'TEMP_minmax', 'TEMP_delta', 'AVPU_minmax', 'AVPU_delta', 'ESTIMATED_FIO2_2_minmax', 'ESTIMATED_FIO2_2_delta']

X = df_train[variables]

# standardisation
X_unscaled = X.to_numpy()
X_scaled = StandardScaler().fit_transform(X_unscaled)

#PCA
pca = PCA(n_components=0.8, svd_solver='full')
X_PCA_latent = pca.fit_transform(X_scaled)

#AutoEncoder (paste parameters according to input features set selected)
dimension = len(variables)
batch = 32
n_neurons = 128
n_layers = 2  
n_latdim = 42

input_data = keras.Input(shape=(dimension,))
encoded = layers.Dense(n_neurons, activation='relu')(input_data)
for encoder_layer in range(1, n_layers): 
    encoded = layers.Dense(n_neurons, activation='relu')(encoded)
    
encoded = layers.Dense(n_latdim, activation='relu')(encoded)

decoded = layers.Dense(n_neurons, activation='relu')(encoded)         
for decoder_layer in range(1, n_layers): 
    decoded = layers.Dense(n_neurons, activation='relu')(decoded)
decoded = layers.Dense(dimension)(decoded)

autoencoder = keras.Model(input_data, decoded)
encoder = keras.Model(input_data, encoded)
autoencoder.compile(optimizer='adam', loss='mean_absolute_error')
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=batch, shuffle=True, verbose = 0)
X_AE_latent = encoder.predict(X_scaled)

# define the dendrogram plotting function (adapted from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)
def plot_dendrogram(model, **kwargs):
    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# define and fit the hierarchical clustering models with different dimensionality reduction methods    
model_PCA = AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity = 'euclidean', linkage ='ward')
model_PCA = model_PCA.fit(X_PCA_latent)

model_AE = AgglomerativeClustering(distance_threshold=0, n_clusters=None, affinity = 'euclidean', linkage ='ward')
model_AE = model_AE.fit(X_AE_latent)

# plot figure 8
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
fig.subplots_adjust(wspace=0.25)

plot_dendrogram(model_PCA, ax=ax1, truncate_mode="level", p=4, leaf_font_size=13, leaf_rotation=90, color_threshold = 60)
ax1.set_title("PCA + Hierarchical Clustering Dendrogram", fontsize = 16, pad = 15)
ax1.set_xlabel("Number of points in mid-level clusters (nodes)\n(or index of single point cluster if no parenthesis)", fontsize = 12, labelpad = 15)
ax1.set_ylabel("Distance between clusters\n(ward method)", fontsize = 12)
ax1.tick_params(labelsize=12)

plot_dendrogram(model_AE, ax=ax2, truncate_mode="level", p=4, leaf_font_size=12, leaf_rotation=90, color_threshold = 80)
ax2.set_title("AE + Hierarchical Clustering Dendrogram", fontsize = 16, pad = 15)
ax2.set_xlabel("Number of points in mid-level clusters (nodes)\n(or index of single point cluster if no parenthesis)", fontsize = 12, labelpad = 15)
ax2.set_ylabel("Distance between clusters\n(ward method)", fontsize = 12)
ax2.tick_params(labelsize=12)

# adapt name to input features set selected
plt.savefig('C:/Users/baptiste.vasey/Documents/Thesis/Figures/Fig8_hierarchical_all_' + pop.name + '.pdf', bbox_inches = 'tight')
plt.show()