# Characterise the seminal fluid proteins of a monogamous species, D. subobscura
# Compare the importance and function of proteins in D. sub and a promiscuous species, D. melanogaster

# Part 1 - use Dsub proteomic data that has been vsn normalised. Produce a list of transferred proteins (using limma differential abundnance) to be used in Part 2 of analysis

# this analysis uses all the 1389 proteins in the accessory glands and ejaculatory duct using dataset SupplementaryData1_Dsub_AG_ED_proteins_LFQ
# and the BLAST names of the 172 transferred (candidate) sfps Annotation_172Proteins.csv

#########################################################################
# load packages
#########################################################################
library(tidyverse)
library(pvclust)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(reshape2)
library(limma)
library(viridis) # needed for colours in heatmap
library(gridExtra)  # Needed for arranging multiple plots
library(grid)       # Needed for grid functions
library(cowplot) # for plot_grid

#########################################################################
# Read in Sex-Peptide network and sperm competition protein lists 
#########################################################################
#male-derived sex peptide network 
SP <- c("SP", "Sems", "lectin-46Cb", "lectin-46Ca", "intr", "aqrs", "CG9997", "CG17575", "antr") 

# Proteins with a role in sperm competition/post-copulatory sexual selection
# This list is from Wigby 2020 (The seminal fluid proteome and its role in post copulatory selection) Supplementary Data 2 - Dmel sfp Human hits - role in Dmel Post Copulatory Sexual Selection (PCSS) column
SCgenes <-c("Acp29AB", "Acp33A", "Acp36DE", "Acp53Ea", "Acp62F", "Acp76A", "CG17575", "CG31872", "CG9997", "SP", "Spn28F", "lectin-46Ca", "lectin-46Cb", "msopa") #14 sp comp proteins 

#########################################################################
### 1. Compare mated and unmated samples
#########################################################################
# Abundance data, 5 samples per mated and unmated treatments, 1389 proteins
d <- read.csv("/Users/#pathway#/SupplementaryData1_Dsub_AG_ED_proteins_LFQ.csv")
# analysis just needs expression levels
dataEachSample <- d[,c("mated_01", "mated_02", "mated_03", "mated_04", "mated_05", "unmated_01", "unmated_02", "unmated_03", "unmated_04", "unmated_05")]
###########################
# PCA analysis
###########################
# we first want to test whether mating causes differences in the proteome in the ejaculatory duct and accessory gland - we will do this using PCA and cluster analysis

# Transpose data: samples as rows
dataTransposed <- t(dataEachSample)

# Perform PCA (scale. = TRUE is generally recommended)
data.pca <- prcomp(dataTransposed, scale. = TRUE)

# Extract PC1 and PC2 scores 
# scores = data.pca$x - location of each sample in space, visualise how each sample clusters by treatment
pca_df <- as.data.frame(data.pca$x[, 1:2])
colnames(pca_df) <- c("PC1", "PC2")

# Add treatment labels based on sample names
sample_names <- rownames(pca_df)
pca_df$Sample <- ifelse(grepl("^mated", sample_names), "mated", "unmated")

# Variance explained by each PC
explained_var <- data.pca$sdev^2 / sum(data.pca$sdev^2)

# Percentage for PC1 and PC2
pc1_var <- explained_var[1] * 100
pc2_var <- explained_var[2] * 100

# Plot with ellipses - Supplementary Figure 1A
p.PCA <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Sample)) +
  scale_color_manual(values=c("grey60", "black")) +
  geom_point(size = 3) +
  stat_ellipse(type = "t", linetype = 2) +  # normal ellipse or more robust (t)
  labs(x = "PC1 (33.6% variance)", y = "PC2 (18.6% variance)") +
  theme_classic() + 
  theme(legend.position = c(0.8, 0.85), axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14), axis.title = element_text(size = 14))

p.PCA
###########################
# Clustering analysis
###########################
results <- pvclust(dataEachSample, method.hclust = 'average', method.dist = 'euclidean', nboot = 1000)
plot(results, hang = -1, print.num = FALSE)
pvrect(results, alpha=0.95)
#mated and unmated samples cluster into two different groups - unmated_02 is different from unmated_1, 3, 4 & 5 which cluster together, mated_01 & mated_04 are closer than mated_02 which are closer than mated_03 and 05.
#########################################################################
# Calculate differential abundance between unmated and mated samples
#########################################################################
# which proteins have significantly different abundance between mated and unmated males - these proteins are likely to have been transferred during mating and so are candidate seminal fluid proteins. We will use limma

# Create the design matrix from the sample information
design <- model.matrix(~-1 + factor(rep(1:2, each=5)))

colnames(design) <- c("mated","unmated")
head(design)
# equation is unmated - mated i.e. proteins transferred have positive LogFC values
contrast <- makeContrasts(unmated - mated, levels = design)
# Apply linear model to data
fit <- lmFit(d, design)
fit2 <- contrasts.fit(fit, contrast)
# Apply empirical Bayes to smooth standard errors
fit2 <- eBayes(fit2)
# Apply multiple testing correction and obtain stats
diffExpLimma <- topTable(fit2, number = nrow(d), adjust.method = "BH") #1389 proteins 
dim(diffExpLimma) #1389 proteins

# run limma diagnostics
plotMD(fit2) # differentially expressed proteins above and below x=0
abline(0,0, col='blue')
plotSA(fit2, main="Standard deviation vs. average log expression") #blue line represents eBayes

# identify differential abundance proteins
diffExpLimma$DE <- ifelse(diffExpLimma$adj.P.Val < 0.05, "DE", "not")
# identify protein with higher abundance in unmated males
diffExpLimma$Dirn <- ifelse(diffExpLimma$logFC > 0, "Up", "down")
# identify transferred proteins (both significant and higher in unmated)
diffExpLimma$SFprotein <- as.factor(
  ifelse(diffExpLimma$DE == 'DE' & diffExpLimma$Dirn == 'Up', "transferred",
  ifelse(diffExpLimma$DE == 'DE' & diffExpLimma$Dirn == "down", "mated up", 'not different')))

table(diffExpLimma$DE) # 256 differentially expressed proteins
table(diffExpLimma$SFprotein) # 172 transferred proteins

diffExpLimma$log_adj_pval <- -log10(diffExpLimma$adj.P.Val)

# now I need to merge the Limma/DE stats with the original abundance values
masterd <- merge(d, diffExpLimma, by.x = "X", by.y = "X", all.x = TRUE)

# rank all proteins 1 to 1389 for whole data ranked abundance plot
#get mean value per protein in unmated samples
ColsToAv <- c("unmated_01", "unmated_02","unmated_03", "unmated_04", "unmated_05")
masterd$Av_unmated_allproteins <- apply(masterd[,ColsToAv], 1, mean)

# order by value of av_unmated
ord <- arrange(masterd, desc(Av_unmated_allproteins)) %>%
  mutate(rank_allproteins = 1:nrow(masterd)) #1389 proteins

ord_t <- subset(ord, SFprotein == "transferred") #172 proteins

#########################################################################
# annotate dataset - protein names - Detected in official + transcriptome DB
AnnotationList <- read.csv("/Users/#pathway#/SupplementaryData/Annotation_172Proteins.csv") #172 proteins

ord_t_anno <- merge(ord_t, AnnotationList, by.x = "X", by.y = "transcriptID", all.x = TRUE)
#########################################################################
# annotate proteins for SPN and post-copulatory sexual selection
ord_t_anno$SPN <- ifelse(ord_t_anno$gene.y_BLASTPtoDmel %in% SP, "SPN", "other") 
ord_t_anno$PCSS <- ifelse(ord_t_anno$gene.y_BLASTPtoDmel %in% SCgenes, "PCSS", "other") #post-cop sex sel

# Get unique names for protein IDs in Dsub - if there is <NA> it means there is no homology to Dmel
ord_t_anno$gene.y_BLASTPtoDmel <- as.character(ord_t_anno$gene.y_BLASTPtoDmel)
ord_t_anno$gene.y_BLASTPtoDmel[is.na(ord_t_anno$gene.y_BLASTPtoDmel)] <- "no_homology"
# some proteins map to a single Dmel protein so we need to re-name these
ord_t_anno$protein_unique <- make.unique(as.character(ord_t_anno$gene.y_BLASTPtoDmel))

#################################
# write data 
#################################
# Annotated proteins with abundance levels and differential expression data and rank. This dataset will be used for part 2 of the analysis which explores the seminal fluid proteins of Dsub
#write.csv(ord_t_anno, "/Users/#pathway#/SupplementaryData2_Dsub_sfp_abundance_172transferred.csv")
#########################################################################
# All proteins
#########################################################################
# we are working with the 1389 proteins in the AG & ED
data_all <- ord

#########################################################################
# Volcano plots - Supplementary Figure 1C
#########################################################################
# logFC on x, -log10(padjust) on y, transferred during mating are up in unmated, horizontal line for padjust < 0.05. We will use a volcano plot to visulaise the proteins that are transferred during mating
p.vol <- ggplot() +
  geom_point(data = data_all, aes(x = logFC, y = log_adj_pval, colour = SFprotein)) +
  scale_color_manual(values = c("#0072B2", "#4D4D4D", "#D55E00")) +
  theme_classic() +
  ylab("log10 adj p-value") +
  geom_abline(slope=0, intercept=-log10(0.05)) +
  theme(legend.position="none", axis.title = element_text(size = 14), 
        axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14)) +
  annotate("text", x=1.2, y=6.2, label= "higher in unmated") +
  annotate("text", x=-1.2, y=6.2, label= "higher in mated") +
  geom_segment(aes(x = 0.4, y = 6, xend = 2, yend = 6),
               arrow = arrow(length = unit(0.2, "cm"))) +
  geom_segment(aes(x = -0.5, y = 6, xend = -2, yend = 6),
               arrow = arrow(length = unit(0.2, "cm")))

p.vol

#########################################################################
# Ranked abundance of all proteins with transferred proteins in orange 
#########################################################################
# visualise abundance of transferred proteins in unmated samples. Remove data for mated abundance samples
ord_unmated <- data_all[,c("X", "unmated_01", "unmated_02", "unmated_03", "unmated_04", "unmated_05", "AveExpr",  "SFprotein", "log_adj_pval", "Av_unmated_allproteins", "rank_allproteins")]

# melt data to get protein, rank order, variable (ie mated or unmated) and value (ie abundance)
meltorder <- melt(ord_unmated, id=c("X", "AveExpr", "SFprotein", "log_adj_pval", "Av_unmated_allproteins", "rank_allproteins"))
# 6945 lines, ie 5 lines per protein, 1389 proteins

# all the proteins - hard to see the colours so outlines also changed
bxpl <- ggplot(meltorder, aes(x = reorder(rank_allproteins, -value), y = value, fill = SFprotein, colour = SFprotein)) +
  geom_boxplot(aes(group = cut_width(rank_allproteins, 1))) + scale_alpha_manual(values=c(0.6,0.6,0.6)) +
  scale_fill_manual(values = c("mated up" = "#0072B2", 
                               "not different" = "grey", 
                               "transferred" = "#D55E00")) + 
  scale_colour_manual(values = c("mated up" = "#0072B9", 
                                 "not different" = "grey50", 
                                 "transferred" = "#D55E00")) + 
  xlab("Ranked abundance") +  
  ylab("Mean abundance") +
  scale_x_discrete(breaks = c(0, 500, 1000, 1500), labels = c("0", "500", "1000", "1500")) +
  scale_y_continuous(limits = c(10, 30)) +   
  theme_bw() +  
  theme(legend.position = c(0.8, 0.85))

bxpl

# meltorder has 6945 lines, 5 samples per 1389 proteins
nd_meltorder <- subset(meltorder, !duplicated(rank_allproteins)) #1389 proteins
kruskal.test(rank_allproteins ~ SFprotein, data = nd_meltorder) # chi-squared = 209.79
#install.packages("FSA")
library(FSA)
# post-hoc test for transferred vs not different
dunnTest(nd_meltorder$rank_allproteins, nd_meltorder$SFprotein)

#########################################################################
# Supplementary Figure - Plot ranked abundance with transferred proteins in orange
#########################################################################
# plot ranked order boxplot on its own
#dev.new(width = 12, height = 5, noRStudioGD = TRUE)
#plot(bxpl)

#########################################################################
# Heatmap plot 1300 proteins and highlight where the significant DE proteins are
#########################################################################
# i need two dataframes with the same row names, one for abundance levels, one to denote the DE proteins
# all proteins with DE pvals
#1389 proteins with rank 1:1389 calculated from unmated samples
data_all$SFprotein <- as.factor(data_all$SFprotein)
data_all$Protein <- data_all$SFprotein

data_all$proteinID <- paste("p", data_all$rank_allproteins, sep=".")
rownames(data_all) <- data_all$proteinID
allheatmap <- data_all[,c("mated_01", "mated_02", "mated_03", "mated_04", "mated_05", "unmated_01", "unmated_02", "unmated_03", "unmated_04", "unmated_05")]

Protein <- data_all[,"Protein"]
allDE <- as.data.frame(Protein) #make sure the rownames of allDE match that of allheatmap (ie p.1, p.2)
rownames(allDE) <- data_all$proteinID

# all these need to match
#rownames(allheatmap)
#rownames(allDE)

# Replace with distinct colors
ann_colors <- list(
  Protein = c("mated up" = "#0072B2",
              "not different" = "#4D4D4D", 
              "transferred" = "#D55E00")
)

myheatmap_all <- pheatmap(allheatmap,
                          cluster_row=T, cluster_cols=T, 
                          show_rownames=FALSE, fontsize_row = 7, 
                          annotation_row=allDE, 
                          annotation_colors = ann_colors, #apply custom colours
                          show_colnames=T, 
                          legend = T,
                          fontsize = 11,
                          cex = 1,
                          clustering_method='average', color = inferno(500))

# Use function to Plot Supplementary Figure - PCA, volcano plot and heatmap of all proteins
#save_plots_and_pheatmap_pdf(p.PCA, p.vol, myheatmap_all, "/Users/#pathway#/1.Analysis/Figure2_PCA_volcano_heatmap.pdf")

########################################################################
# Function - save heatmaps and plots in one figure
########################################################################
save_plots_and_pheatmap_pdf <- function(plot1, plot2, plot3, filename, width=12, height=12) {
  stopifnot(!missing(plot1))
  stopifnot(!missing(plot2))
  stopifnot(!missing(filename))
  
  # Open PDF file
  pdf(filename, width=width, height=height)  
  grid::grid.newpage()
  
  #col1 <- align_plots(plot1, plot2, align = 'v') # pca and vocano same height
  col1 <- plot_grid(plot1, plot2, ncol=1, rel_heights = c(1,1), labels = c("A", "C")) #diff heights
  
  # Combine plots with labels
  combined_plot <- plot_grid(
    col1, plot3$gtable,
    rel_widths = c(1,1),
    labels = c("", "B"),  # Add labels
    label_size = 14,        # Adjust label size
    ncol = 2               # Arrange side by side
  )
  
  grid.draw(combined_plot)  # Draw the combined plot with labels
  dev.off()  # Close the PDF file
}

########################################################################