library(tidyverse)
library(ggplot2)
library(ggforce)

# Load the PCA output: the per-sample eigenvector table (first row is treated
# as a header) and the plain numeric vector of eigenvalues.
pca <- read_table("./PCA_with_all_samples.eigenvec", col_names = TRUE)
eigenval <- scan("./PCA_with_all_samples.eigenval")

# Drop the first column, then rename what is left: the sample identifier
# becomes "ind" and every remaining column becomes PC1, PC2, ...
# seq_len() is used instead of 1:(n - 1) / 2:n so that a table holding only
# the identifier column does not produce a backwards-counting sequence.
pca <- pca[, -1]
names(pca) <- c("ind", paste0("PC", seq_len(ncol(pca) - 1)))

# Assign a species label to every sample based on its identifier

# Each rule is c(pattern, label). The pattern is handed to grep(), so it is
# matched as a regular expression anywhere inside pca$ind. Rules are applied
# top to bottom: a sample matching several patterns keeps the label of the
# last matching rule, and a sample matching none stays NA.
species_rules <- list(
  c("aequatoriensis", "Ipomoea aequatoriensis (ECU)"),
  c("CH80_3", "Ipomoea aequatoriensis (COL)"),
  c("Ipomoea_batatas", "Ipomoea batatas (6X)"),
  c("apiculata", "Ipomoea batatas var. apiculata"),
  c("trifida", "Ipomoea trifida"),
  c("tabascana", "Ipomoea tabascana")
)

spp <- rep(NA, length(pca$ind))
for (rule in species_rules) {
  spp[grep(rule[1], pca$ind)] <- rule[2]
}

# Assign a geographical region to every sample based on its identifier

# Ordered runs of (region, identifier patterns). Every pattern is handed to
# grep(), so it is matched as a regular expression anywhere inside pca$ind.
# Runs and the patterns within them are applied strictly top to bottom: a
# sample matching several patterns keeps the region of the last match, and a
# sample matching none stays NA.
region_runs <- list(
  list(
    region = "Mexico_and_Central_America",
    ids = c("PI518474", "tabascana", "K233", "DFA7480")
  ),
  list(
    region = "Colombia",
    ids = c("CH80_3")
  ),
  list(
    region = "Ecuador",
    ids = c("PI561258", "PI561246", "PI561248", "PI561255")
  ),
  list(
    region = "Bolivia_Brazil_Peru",
    ids = c(
      "CIP400033", "CIP400155", "CIP400157", "CIP400162", "CIP400205",
      "CIP400218", "CIP400238", "CIP400274", "CIP400287", "CIP400339"
    )
  ),
  list(
    region = "Colombia",
    ids = c(
      "CIP400356", "CIP400361", "CIP400389", "CIP400423", "CIP400433",
      "CIP400435", "CIP400442", "CIP400453", "CIP400466", "CIP400493",
      "CIP400502", "CIP400507", "CIP400519"
    )
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c("CIP400551")
  ),
  list(
    region = "Caribbean",
    ids = c(
      "CIP400572", "CIP400784", "CIP400786", "CIP400788", "CIP400796",
      "CIP400803", "CIP400811", "CIP400815", "CIP400819", "CIP400822",
      "CIP400826"
    )
  ),
  list(
    region = "Ecuador",
    ids = c("CIP400941")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c(
      "CIP401056", "CIP401062", "CIP401067", "CIP401068", "CIP401079",
      "CIP401101", "CIP401129", "CIP401150", "CIP401210", "CIP401217"
    )
  ),
  list(
    region = "Venezuela",
    ids = c("CIP401404", "CIP401443", "CIP401459", "CIP401499", "CIP401523")
  ),
  list(
    region = "Bolivia_Brazil_Peru",
    ids = c("CIP420068", "CIP420386", "CIP420602", "CIP420882")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c("CW71")
  ),
  list(
    region = "Bolivia_Brazil_Peru",
    ids = c("DMC407", "JRIW27946", "MN37607")
  ),
  list(
    region = "Ecuador",
    ids = c("CH71_3", "CH81_2")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c("PI518479")
  ),
  list(
    region = "Venezuela",
    ids = c("CIP460021", "CIP460055", "CIP460061", "CIP460066", "CIP460096")
  ),
  list(
    region = "Colombia",
    ids = c("CIP460099", "CIP460120", "CIP460134", "CIP460144")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c(
      "CIP460183", "CIP460260", "CIP460274", "CIP460384", "CIP460396",
      "CIP460403", "CIP460407"
    )
  ),
  list(
    region = "Caribbean",
    ids = c("CIP460530", "CIP460547")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c("CIP460733", "CIP460739", "CIP460756")
  ),
  list(
    region = "Colombia",
    ids = c("DLP1054")
  ),
  list(
    region = "Mexico_and_Central_America",
    ids = c("DLP2961", "PW108")
  )
)

loc <- rep(NA, length(pca$ind))
for (run in region_runs) {
  for (id in run$ids) {
    loc[grep(id, pca$ind)] <- run$region
  }
}

# Build a combined "<species>_<location>" label for every sample

spp_loc <- paste(spp, loc, sep = "_")

# Rebuild the table with the three annotation vectors appended as columns
pca <- as_tibble(
  data.frame(pca, spp = spp, loc = loc, spp_loc = spp_loc)
)

################################
### PLOTTING THE DATA ###

# Percentage of variance explained by each PC, relative to the sum of the
# eigenvalues that were read in. seq_along() keeps this valid for any number
# of eigenvalues instead of assuming there are exactly 10.
pve <- data.frame(
  PC = seq_along(eigenval),
  pve = eigenval / sum(eigenval) * 100
)

# One fixed colour per species label. Naming the vector ties each colour to
# its label, so colours do not shift when a species is absent from the data.
# With all six labels present this is the same assignment the unnamed vector
# produced (labels taken in alphabetical order).
species_colours <- c(
  "Ipomoea aequatoriensis (COL)" = "light blue",
  "Ipomoea aequatoriensis (ECU)" = "blue",
  "Ipomoea batatas (6X)" = "green",
  "Ipomoea batatas var. apiculata" = "orange",
  "Ipomoea tabascana" = "black",
  "Ipomoea trifida" = "red"
)

# Scatter plot of one principal component against another, coloured by the
# `spp` column, with a normal-theory ellipse drawn per species.
#   scores  : table holding the PC<n> columns and a `spp` column
#   pve     : data frame whose `pve` column gives the percentage per PC
#   x_pc    : index of the component on the x axis
#   y_pc    : index of the component on the y axis
#   colours : named colour vector keyed by species label
# Returns the ggplot object; the caller is responsible for printing it.
plot_pc_pair <- function(scores, pve, x_pc, y_pc, colours) {
  pc_label <- function(i) {
    paste0("PC", i, " (", signif(pve$pve[i], 3), "%)")
  }

  ggplot(
    scores,
    aes(.data[[paste0("PC", x_pc)]], .data[[paste0("PC", y_pc)]], col = spp)
  ) +
    geom_point(size = 3) +
    guides(x = guide_axis(angle = 50)) +
    scale_colour_manual(values = colours) +
    coord_equal() +
    theme_light() +
    stat_ellipse(type = "norm") +
    xlab(pc_label(x_pc)) +
    ylab(pc_label(y_pc)) +
    theme(
      axis.text = element_text(size = 16),
      axis.title = element_text(size = 20, face = "bold")
    )
}

pdf("PCA_with_all_samples.pdf", width = 16, height = 8, onefile = TRUE)

# Bar chart of the percentage variance explained per PC.
# Plots are print()ed explicitly so the pages are also written when the
# script is run through source(), which does not auto-print top-level values.
print(
  ggplot(pve, aes(PC, pve)) +
    geom_bar(stat = "identity") +
    ylab("Percentage variance explained") +
    theme_light()
)

# Cumulative sum of the percentage variance explained
print(cumsum(pve$pve))

# PCA scatter plots: PC1 vs PC2, PC1 vs PC3, PC2 vs PC3 (one page each)
pc_pairs <- list(c(1, 2), c(1, 3), c(2, 3))
for (pair in pc_pairs) {
  print(plot_pc_pair(pca, pve, pair[1], pair[2], species_colours))
}

dev.off()
