##################################
#### FIGURE CREATION COMMANDS ####
##################################

## Commands to create a boxplot of the distribution of F-scores for each pipeline, coloured by variant caller
## This creates Figure 7

library(ggplot2)
library(data.table)

df<-read.table('~/varcall_evaluation/commandsForOriginalFigures/evaluationOutput/summary_statistics.tsv',sep='\t',header=T)
df.sub<-subset(df, df$Genome != 'rhb11c04') # this sample, an E. coli from cattle faeces (see Supplementary Text 8), is a notable outlier on all pipelines
df<-df.sub

f_score<-df$F.score...2...precision..recall..precision...recall..
aligner<-df$Aligner
caller<-df$Caller
aligner_and_caller<-df$Aligner.caller

fig<-ggplot(df, aes(x = aligner_and_caller, y = f_score)) + geom_boxplot() + theme_bw() + scale_x_discrete(name = "Aligner/caller") + scale_y_continuous(name = "F-score") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
fig<-ggplot(df, aes(x = reorder(aligner_and_caller,f_score,FUN=median), y = f_score, fill=reorder(caller,f_score,FUN=median))) + geom_boxplot() + theme_bw() + scale_x_discrete(name = "Aligner/caller") + scale_y_continuous(name = "F-score") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
caller_colours<-c("red","pink","deepskyblue","gold","darkmagenta","blue","violet","orange","darkblue","green","grey","yellow","seagreen","brown","black","white","purple")

names(caller_colours) <- levels(df$Caller)
colour_scale <- scale_fill_manual(name = "Caller",values = caller_colours, breaks=levels(unique(df$Caller)))
fig<-fig + colour_scale

## Commands to create before-and-after plots showing the effect of repeat-masking on the F-score, precision and recall of 209 pipelines evaluated using real sequencing data
## This creates the unnumbered figure on page 33 of Supplementary Text 1

df<-read.table('~/varcall_evaluation/commandsForOriginalFigures/evaluationOutput/summary_statistics.tsv',sep='\t',header=T)
total_number_of_snps_no_masking<-df$Total.number.of.true.SNPs
precision_no_masking<-df$Precision..specificity.
recall_no_masking<-df$Recall..sensitivity.
f_score_no_masking<-df$F.score...2...precision..recall..precision...recall..

df<-read.table('~/varcall_evaluation/commandsForOriginalFigures/evaluationOutput-repeatmasked/summary_statistics.tsv',sep='\t',header=T)
total_number_of_snps_repeat_masking<-df$Total.number.of.true.SNPs
precision_repeat_masking<-df$Precision..specificity.
recall_repeat_masking<-df$Recall..sensitivity.
f_score_repeat_masking<-df$F.score...2...precision..recall..precision...recall..

par(mfrow=c(2,2))
plot(total_number_of_snps_no_masking,total_number_of_snps_repeat_masking,pch=20,las=1,xlab='Total no. of SNPs (no repeat masking)',ylab='Total no. of SNPs (with repeat masking)')
abline(0,1,col='red',lwd=1)
plot(f_score_no_masking,f_score_repeat_masking,pch=20,las=1,xlab='F-score (no repeat masking)',ylab='F-score (with repeat masking)')
abline(0,1,col='red',lwd=2)
plot(precision_no_masking,precision_repeat_masking,pch=20,las=1,xlab='Precision (no repeat masking)',ylab='Precision (with repeat masking)')
abline(0,1,col='red',lwd=2)
plot(recall_no_masking,recall_repeat_masking,pch=20,las=1,xlab='Recall (no repeat masking)',ylab='Recall (with repeat masking)')
abline(0,1,col='red',lwd=2)

## Commands to create scatter plots of genetic distance (either the total no. of SNPs, or Mash distance) with average F-score (or precision) across the full set of pipelines evaluated per species
## These commands create Figure 3, and Supplementary Figures 1 and 2

library(ggplot2)
library(data.table)

df1<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/c_difficile.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df2<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/neisseria.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df3<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/shigella.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df4<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/streptococcus.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df5<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/tb.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df6<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/listeria.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df7<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/e_coli.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df8<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/klebsiella.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df9<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/salmonella.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df10<-read.table('~/varcall_evaluation/commandsForOriginalFigures/average_f_score_for_all_pipelines_per_strain_vs_strain_uniqueness/staphylococcus.average_pipeline_f_score_vs_strain_uniqueness.tsv',sep='\t',header=T)
df<-as.data.frame(rbindlist(list(df1,df2,df3,df4,df5,df6,df7,df8,df9,df10)))

df$Species <- gsub("c_difficile","C. difficile",df$Species)
df$Species <- gsub("neisseria","N. gonorrhoeae",df$Species)
df$Species <- gsub("shigella","S. dysenteriae",df$Species)
df$Species <- gsub("streptococcus","S. pneumoniae",df$Species)
df$Species <- gsub("tb","M. tuberculosis",df$Species)
df$Species <- gsub("listeria","L. monocytogenes",df$Species)
df$Species <- gsub("e_coli","E. coli",df$Species)
df$Species <- gsub("klebsiella","K. pneumoniae",df$Species)
df$Species <- gsub("salmonella","S. enterica",df$Species)
df$Species <- gsub("staphylococcus","S. aureus",df$Species)

## This creates Figure 3
species<-df$Species
distance<-df$Mash.distance.between.strain.and.representative.genome..i.e..an.estimate.of.the.Jaccard.index..the.fraction.of.shared.k.mers.
f_score<-df$Average.F.score
fig<-ggplot(df, aes(distance,f_score)) + geom_point(aes(colour=factor(species))) + theme_bw() + scale_x_continuous(name = "Mash distance between strain and representative genome") + scale_y_continuous(name = "Median F-score of all pipelines per strain")
fig<-fig + scale_colour_manual(values=c("red","pink","deepskyblue","gold","darkmagenta","blue","violet","orange","darkblue","green","grey"), name = "Species")
cor.test(distance,f_score,method=c('spearman'))

## This creates Supplementary Figure 1
species<-df$Species
distance<-df$Total.no..of.SNPs.identified.after.whole.genome.alignment.of.this.strain.genome.and.the.representative.genome
f_score<-df$Average.F.score
fig<-ggplot(df, aes(distance,f_score)) + geom_point(aes(colour=factor(species))) + theme_bw() + scale_x_continuous(name = "Genetic distance between strain and representative genome\n(as number of SNPs in strain relative to representative)") + scale_y_continuous(name = "Median F-score of all pipelines per strain")
fig<-fig + scale_colour_manual(values=c("red","pink","deepskyblue","gold","darkmagenta","blue","violet","orange","darkblue","green","grey"), name = "Species")
cor.test(distance,f_score,method=c('spearman'))

## This creates Supplementary Figure 2
species<-df$Species
distance<-df$Mash.distance.between.strain.and.representative.genome..i.e..an.estimate.of.the.Jaccard.index..the.fraction.of.shared.k.mers.
recall<-df$Average.recall
fig<-ggplot(df, aes(distance,recall)) + geom_point(aes(colour=factor(species))) + theme_bw() + scale_x_continuous(name = "Mash distance between strain and representative genome") + scale_y_continuous(name = "Median recall (sensitivity) of all pipelines per strain")
fig<-fig + scale_colour_manual(values=c("red","pink","deepskyblue","gold","darkmagenta","blue","violet","orange","darkblue","green","grey"), name = "Species")
cor.test(distance,recall,method=c('spearman'))