iPSC splicing

Read in the splicing results returned by MAJIQ and make a volcano plot, only highlight genes of interest with a label.

ipsc_splicing = fread(file.path(here::here(),"data","ipsc_splicing_results.csv"))
ipsc_de = fread(file.path(here::here(),"data","ipsc_differential_expression.csv"))


splicing_dots_tables <- ipsc_splicing %>%
    mutate(junction_name = case_when(gene_name %in% c("UNC13A","AGRN",
                                                      "UNC13B","PFKP","SETD5",
                                                      "ATG4B","STMN2") &
                                         p_d_psi_0_10_per_lsv_junction > 0.9  &
                                         deltaPSI > 0 ~ gene_name,
                                     T ~ "")) %>%
    mutate(`Novel Junction` = de_novo_junctions == 0) %>%
    mutate(log10_test_stat = -log10(1 - p_d_psi_0_10_per_lsv_junction)) %>%
    mutate(log10_test_stat = ifelse(is.infinite(log10_test_stat), 16, log10_test_stat)) %>%
    mutate(graph_alpha = ifelse(p_d_psi_0_10_per_lsv_junction > 0.9, 1, 0.2)) %>%
    mutate(label_junction = case_when(gene_name %in% c("UNC13A","AGRN",
                                                       "UNC13B","PFKP","SETD5",
                                                       "ATG4B","STMN2") &
                                          p_d_psi_0_10_per_lsv_junction > 0.9  &
                                          deltaPSI > 0 ~ junction_name,
                                      T ~ ""))

fig1a = ggplot() +
    geom_point(data = splicing_dots_tables %>% filter(de_novo_junctions != 0),
               aes(x = deltaPSI, y =log10_test_stat,
                   alpha = graph_alpha,,fill = "Annotated Junction"), pch = 21, size = 10) +
    geom_point(data = splicing_dots_tables %>% filter(de_novo_junctions == 0),
               aes(x = deltaPSI, y =log10_test_stat,
                   alpha = graph_alpha,fill = "Novel Junction"), pch = 21, size = 10) +
    geom_text_repel(data = splicing_dots_tables[junction_name != ""],
                    aes(x = deltaPSI, y =log10_test_stat,
                        label = label_junction,
                        color = as.character(de_novo_junctions)), point.padding = 0.3,
                    nudge_y = 0.2,
                    min.segment.length = 0,
                    box.padding  = 2,
                    size=6,show.legend = F) +
    geom_hline(yintercept = -log10(1 - .9)) +
    geom_vline(xintercept = -0,linetype="dotted") +
    scale_fill_manual(name = "",
                      breaks = c("Annotated Junction", "Novel Junction"),
                      values = c("Annotated Junction" = "#648FFF", "Novel Junction" = "#fe6101") ) +
    scale_color_manual(name = "",
                       breaks = c("0", "1"),
                       values = c("1" = "#648FFF", "0" = "#fe6101") ) +
    guides(alpha = FALSE, size = FALSE) +
    theme(legend.position = 'top') +
    ggpubr::theme_pubr() +
    xlab("delta PSI") +
    ylab(expression(paste("-Lo", g[10], " Test Statistic"))) +
    theme(text = element_text(size = 24)) +
    theme(legend.text=element_text(size=22)) +
    xlim(-1,1) +
    scale_x_continuous(labels = scales::percent)
Scale for 'x' is already present. Adding another scale for 'x', which will
replace the existing scale.
de_table = ipsc_de %>%
    mutate(contains_cryptic = gene_name %in% splicing_dots_tables[cryptic_junction == T,unique(gene_name)]) %>%
    mutate(contains_cryptic = as.character(as.numeric(contains_cryptic))) %>%
    mutate(label_junction = case_when(gene_name %in% c("UNC13A","AGRN",
                                                       "UNC13B","PFKP","SETD5",
                                                       "ATG4B","STMN2","TARDBP") ~ gene_name,
                                      T ~ "")) %>%
    mutate(graph_alpha = ifelse(padj < 0.1, 1, 0.2)) %>%
    mutate(y_data = -log10(padj))


fig1b = ggplot(data = de_table) +
    geom_point(data = de_table %>% filter(contains_cryptic == "0"),
               aes(x = log2FoldChange, y = -log10(padj),
                   alpha = graph_alpha,fill = "No Cryptic"), pch = 21, size = 10) +
    geom_point(data = de_table %>% filter(contains_cryptic == "1"),
               aes(x = log2FoldChange, y = -log10(padj),
                   alpha = graph_alpha,fill = "Contains Cryptic"), pch = 21, size = 10)+
    geom_text_repel(data = de_table[label_junction != ""],max.overlaps = 500,
                    aes(x = log2FoldChange,
                        y = -log10(padj),
                        label = label_junction,
                        color = as.character(contains_cryptic)),
                    nudge_y = 5,
                    min.segment.length = 0,
                    box.padding  = 4,
                    size=6,show.legend = F) +
    scale_fill_manual(name = "",
                      breaks = c("No Cryptic", "Contains Cryptic"),
                      values = c("No Cryptic" = "#648FFF", "Contains Cryptic" = "#fe6101") ) +
    scale_color_manual(name = "",
                       breaks = c("1", "0"),
                       values = c("1" = "#fe6101", "0" = "#648FFF") ) +
    xlim(-7.5,7.5) +
    ylab(expression(paste("-Lo", g[10], " P-value"))) +
    guides(alpha = FALSE, size = FALSE) +
    theme(legend.position = 'top') +
    ggpubr::theme_pubr() +
    xlab(expression(paste("Lo", g[2], " Fold Change"))) +
    theme(text = element_text(size = 24)) +
    theme(legend.text=element_text(size=22)) +
    geom_hline(yintercept = -log10(0.1))  +
    geom_vline(xintercept=c(0), linetype="dotted")

print(fig1a)

print(fig1b)

UNC13A CE PSI across TDP-43 KD experiments

The ‘C/G’ tells which genotypes were supported by RNA-seq on rs12973192. The SK-N-DZ cell lines are het, as is the WTC11 cell line. SH-SY5Y cells are homozygote for the major allele. There was variability on the Klim hMN set on allelic expression.

rel_rna_cryptic_amount = data.table::fread(file.path(here::here(),"data","kd_experiments_relative_rna_and_unc13a_cryptic_junction_counts.csv"))
rel_rna_cryptic_amount[,cryptic_psi_full := ( UNC13A_3prime +
                                                  UNC13A_5prime +  UNC13A_5prime_2 +
                                                  UNC13A_5prime_3) / (UNC13A_annotated +  UNC13A_3prime +
                                                                          UNC13A_5prime +  UNC13A_5prime_2 +
                                                                          UNC13A_5prime_3)]

barplot UNC13A CE PSI - full five

rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "cryptic_psi_full",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(name = "",
                      values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(name = "",
                       values = c("#1C2617","#262114")
    ) +
    ylab("UNC13A CE PSI") +
    xlab("") +
    guides(color = FALSE) +
    scale_y_continuous(labels = scales::percent) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28))

barplot UNC13B frameshift PSI

rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "condition",
              add = c("mean_se","jitter"),
              y = "unc13b_nmd_exon_psi",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8),facet.by = 'source') +
    ggpubr::theme_pubr() +
    scale_fill_manual(name = "",
                      values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(name = "",
                       values = c("#1C2617","#262114")
    ) +
    ylab("UNC13B \nNMD Exon PSI") +
    xlab("") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28)) +
    facet_wrap(~source) +
    stat_compare_means() +
    scale_y_continuous(labels = scales::percent)

TARDBP RNA and UNC13A Cryptic

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = TARDBP, y = cryptic_psi_full, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = TARDBP, y = cryptic_psi_full),size = 12,method = 's',cor.coef.name = 'rho') +
    geom_smooth(aes(x = TARDBP, y = cryptic_psi_full),color = "black",method = 'lm') +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(labels = scales::percent) +
    ylab("UNC13A CE PSI") +
    theme(legend.title=element_blank()) +
    theme(legend.position = 'bottom') +
    scale_y_continuous(labels = scales::percent)
`geom_smooth()` using formula 'y ~ x'

UNC13A RNA and UNC13A Cryptic

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = UNC13A, y = cryptic_psi_full, fill = source),pch = 21,size = 6) +
    stat_cor(aes(x = UNC13A, y = cryptic_psi_full),size = 12,
             method = 'spearman',
             cor.coef.name = 'rho') +
    geom_smooth(aes(x = UNC13A, y = cryptic_psi_full),color = "black",method = 'lm') +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(labels = scales::percent) +
    scale_y_continuous(labels = scales::percent) +
    ylab("UNC13A CE PSI") +
    expand_limits(y = 1) +
    theme(legend.title=element_blank()) +
    theme(legend.position = 'bottom') +
    theme(text = element_text(size = 18,family = 'sans'),
          legend.text = element_text(size = 20,family = 'sans'),
          axis.title = element_text(size = 32),
          axis.text = element_text(size = 32))
`geom_smooth()` using formula 'y ~ x'

## UNC13A RNA and UNC13A IR

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = UNC13A, y = normalized_unc13a_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = UNC13A, y = normalized_unc13a_ir),size = 12,cor.coef.name = 'rho',method = 's') +
    geom_smooth(aes(x = UNC13A, y = normalized_unc13a_ir),color = "black",method = 'lm') +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(labels = scales::percent) +
    ylab("UNC13A Normalized IR") +
    theme(legend.title=element_blank()) +
    theme(legend.position = 'bottom')
`geom_smooth()` using formula 'y ~ x'

## TARDBP RNA and UNC13A IR

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = TARDBP, y = normalized_unc13a_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = TARDBP, y = normalized_unc13a_ir),size = 12) +
    geom_smooth(aes(x = TARDBP, y = normalized_unc13a_ir),color = "black",method = 'lm') +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(labels = scales::percent) +
    ylab("UNC13A Normalized IR") +
    theme(legend.title=element_blank()) +
    theme(legend.position = 'bottom')
`geom_smooth()` using formula 'y ~ x'

barplot UNC13A cryptic PSI

rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "cryptic_psi_full",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("UNC13A CE PSI") +
    xlab("") +
    guides(color = FALSE)

barplot STMN2 cryptic PSI

stmn2 = rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "stmn_2_cryptic_psi",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("STMN2 Cryptic PSI") +
    xlab("") +
    guides(color = FALSE)
print(stmn2)

barplot UNC13B normalized IR

unc13b_ir = rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "normalized_unc13b_ir",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("Normalized UNC13B \nIntron Retention Ratio") +
    xlab("") +
    guides(color = FALSE) +
    theme(legend.position = "none") +
    theme(text = element_text(size = 18)) +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28))

print(unc13b_ir)

barplot UNC13A normalized IR

unc13a_ir = rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "normalized_unc13a_ir",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("Normalized UNC13A \nIntron Retention Ratio") +
    xlab("") +
    theme(legend.position = "none") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28))

barplot UNC13A normalized RNA Level

text_table = fread(file.path(here::here(),"data","deseq2_cellline_fold_change.csv"))

rel_rna_cryptic_amount %>%
    ggbarplot(,
              x = "source",
              add = c("mean_se","jitter"),
              y = "UNC13A",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("UNC13A") +
    xlab("") +
    theme(legend.position = "none") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28)) +
    scale_y_continuous(labels = scales::percent) +
    geom_text(data = text_table[gene_name == "UNC13A"],aes(x = source, y= 1.35,label = padj_plot),size = 8) +
    geom_text(data = text_table[gene_name == "UNC13A"],aes(x = source, y= 1.2,label = log2FoldChange_plot),size = 8)

barplot UNC13B normalized RNA Level

text_table = fread(file.path(here::here(),"data","deseq2_cellline_fold_change.csv"))


rel_rna_cryptic_amount %>%
  left_join(text_table[,.(plot_name,source)]) %>% 
  unique() %>% 
  mutate(plot_name = fct_relevel(plot_name,"iPSC MN","SH-SY5Y","I3 Neurons","SK-N-DZ_a")) %>% 
    ggbarplot(,
              x = "plot_name",
              add = c("mean_se","jitter"),
              y = "UNC13B",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("UNC13B") +
    xlab("") +
    theme(legend.position = "none") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28)) +
    scale_y_continuous(labels = scales::percent,expand = expansion(mult = c(0, .1))) +
    geom_text(data = text_table[gene_name == "UNC13B"],aes(x = plot_name, y= 1.35,label = padj_plot),size = 8) +
    geom_text(data = text_table[gene_name == "UNC13B"],aes(x = plot_name, y= 1.2,label = log2FoldChange_plot),size = 8)
Joining, by = c("source", "plot_name")

barplot TARDBP normalized RNA Level

text_table = fread(file.path(here::here(),"data","deseq2_cellline_fold_change.csv"))


rel_rna_cryptic_amount %>%
  left_join(text_table[,.(plot_name,source)]) %>% 
  unique() %>% 
  mutate(plot_name = fct_relevel(plot_name,"iPSC MN","SH-SY5Y","I3 Neurons","SK-N-DZ_a")) %>% 
    ggbarplot(,
              x = "plot_name",
              add = c("mean_se","jitter"),
              y = "TARDBP",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("TARDBP") +
    xlab("") +
    theme(legend.position = "none") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20,family = 'sans'),
          legend.text = element_text(size = 36,family = 'sans'),
          axis.title.y = element_text(size = 28),
          axis.text.y = element_text(size = 28)) +
    scale_y_continuous(labels = scales::percent,expand = expansion(mult = c(0, .1))) +
    geom_text(data = text_table[gene_name == "TARDBP"],aes(x = plot_name, y= 1.35,label = padj_plot),size = 8) +
    geom_text(data = text_table[gene_name == "TARDBP"],aes(x = plot_name, y= 1.2,label = log2FoldChange_plot),size = 8)
Joining, by = c("source", "plot_name")

## Scatterplot stmn2 normalized TARDBP

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = TARDBP, y = stmn_2_cryptic_psi, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = TARDBP, y = stmn_2_cryptic_psi)) +
    theme(legend.position = 'bottom')

Scatterplot UNC13BIR normalized TARDBP

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = TARDBP, y = normalized_unc13b_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = TARDBP, y = normalized_unc13b_ir)) +
    theme(legend.position = 'bottom')

Scatterplot UNC13A-IR and UNC13A Crptic

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = cryptic_psi_full,
                   y = normalized_unc13a_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = cryptic_psi_full,
                 y = normalized_unc13a_ir)) +
    theme(legend.position = 'bottom')

Scatterplot UNC13B-IR and UNC13A Cryptic

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = cryptic_psi_full, y = normalized_unc13b_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = cryptic_psi_full, y = normalized_unc13b_ir)) +
    theme(legend.position = 'bottom')+
    facet_wrap(~source)

scatterplot UNC13BIR and UNC13B NMD

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = unc13b_nmd_exon_psi, y = normalized_unc13b_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = unc13b_nmd_exon_psi, y = normalized_unc13b_ir)) +
    theme(legend.position = 'bottom')+
    facet_wrap(~source)

scatterplot UNC13A CE and UNC13B NMD

rel_rna_cryptic_amount %>%
    filter(condition != "control") %>%
    ggplot() +
    geom_point(aes(x = cryptic_psi, y = normalized_unc13b_ir, fill = source),pch = 21,size = 4) +
    stat_cor(aes(x = unc13b_nmd_exon_psi, y = normalized_unc13b_ir)) +
    theme(legend.position = 'bottom')+
    facet_wrap(~source)

# NMD experiment on SH-SY5Y cells - CHX

for_bar_plot_input <- fread(file.path(here::here(),"data","SY5Y_Tidy_4.csv"))
for_bar_plot <- data.frame(for_bar_plot_input)
for_bar_plot$Treatment <- factor(for_bar_plot$Treatment, levels = c("DMSO", "CHX"))

shapiro.test(filter(for_bar_plot, Treatment == "CHX" & Gene == "HNRNPL")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot, Treatment == "CHX" & Gene == "HNRNPL")$Value
W = 0.87027, p-value = 0.1866
shapiro.test(filter(for_bar_plot, Treatment == "CHX" & Gene == "STMN2")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot, Treatment == "CHX" & Gene == "STMN2")$Value
W = 0.89488, p-value = 0.3011
shapiro.test(filter(for_bar_plot, Treatment == "CHX" & Gene == "UNC13A")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot, Treatment == "CHX" & Gene == "UNC13A")$Value
W = 0.95746, p-value = 0.7967
shapiro.test(filter(for_bar_plot, Treatment == "CHX" & Gene == "UNC13B")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot, Treatment == "CHX" & Gene == "UNC13B")$Value
W = 0.89514, p-value = 0.261
stattest <- for_bar_plot %>%
  group_by(Gene) %>%
  pairwise_t_test(Value ~ Treatment, ref.group = "DMSO", alternative = "greater") %>%
  add_significance() %>% 
  add_xy_position(x="Treatment")
stattest
# A tibble: 4 x 14
  Gene   .y.   group1 group2    n1    n2        p p.signif    p.adj p.adj.signif
  <chr>  <chr> <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>       
1 HNRNPL Value DMSO   CHX        7     7  6.77e-7 ****      6.77e-7 ****        
2 STMN2  Value DMSO   CHX        7     7  8.05e-2 ns        8.05e-2 ns          
3 UNC13A Value DMSO   CHX        7     7  3.64e-4 ***       3.64e-4 ***         
4 UNC13B Value DMSO   CHX        8     8  3.72e-4 ***       3.72e-4 ***         
# … with 4 more variables: y.position <dbl>, groups <named list>, xmin <dbl>,
#   xmax <dbl>
nmd_plot_chx = ggbarplot(for_bar_plot,
                     x = 'Gene',
                     add = c("mean_se","jitter"),
                     y = 'Value',
                     color = 'Treatment',
                     position = position_dodge(0.8),
                     dot.size = 10) +
  scale_y_continuous() + 
  ggpubr::theme_pubr() + stat_pvalue_manual(stattest, label="p.signif" , x = "Gene")
plot(nmd_plot_chx)

NMD experiment on SH-SY5Y cells - sTDP43+sUPF1

for_bar_plot_sy_input <- fread(file.path(here::here(),"data","results_upf1.csv"))

for_bar_plot_sy_df <- data.frame(for_bar_plot_sy_input)
for_bar_plot_sy_df$Condition <- as.factor(for_bar_plot_sy_df$Condition)


#assess TDP-43 and UPF1 KD
for_bar_plot_sz <- for_bar_plot_sy_df %>%
  filter(Gene == "TDP43" & (Condition == "siTDP43+siCTRL" | Condition == "siTDP43+siUPF1" | Condition == "Non-treated"))

shapiro.test(filter(for_bar_plot_sz, Condition == "siTDP43+siCTRL" & Gene == "TDP43")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sz, Condition == "siTDP43+siCTRL" & Gene == "TDP43")$Value
W = 0.99437, p-value = 0.9787
shapiro.test(filter(for_bar_plot_sz, Condition == "siTDP43+siUPF1" & Gene == "TDP43")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sz, Condition == "siTDP43+siUPF1" & Gene == "TDP43")$Value
W = 0.98592, p-value = 0.9357
stattest_sz <- for_bar_plot_sz %>%
  pairwise_t_test(Value ~ Condition, ref.group = "Non-treated", alternative = "less") %>%
  add_significance() %>% 
  add_xy_position(x="Condition")
stattest_sz
# A tibble: 2 x 13
  .y.   group1    group2        n1    n2        p p.signif    p.adj p.adj.signif
  <chr> <chr>     <chr>      <int> <int>    <dbl> <chr>       <dbl> <chr>       
1 Value Non-trea… siTDP43+s…     4     4 1.36e-13 ****     2.72e-13 ****        
2 Value Non-trea… siTDP43+s…     4     4 2.34e-13 ****     2.72e-13 ****        
# … with 4 more variables: y.position <dbl>, groups <named list>, xmin <dbl>,
#   xmax <dbl>
for_bar_plot_sx <- for_bar_plot_sy_df %>%
  filter(Gene == "UPF1" & (Condition == "siTDP43+siCTRL" | Condition == "siTDP43+siUPF1"))

shapiro.test(filter(for_bar_plot_sx, Condition == "siTDP43+siUPF1" & Gene == "UPF1")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sx, Condition == "siTDP43+siUPF1" & Gene == "UPF1")$Value
W = 0.91957, p-value = 0.5345
stattest_sx <- for_bar_plot_sx %>%
  pairwise_t_test(Value ~ Condition, ref.group = "siTDP43+siCTRL", alternative = "less") %>%
  add_significance() %>% 
  add_xy_position(x="Condition")
stattest_sx
# A tibble: 1 x 13
  .y.   group1  group2     n1    n2     p p.signif p.adj p.adj.signif y.position
  <chr> <chr>   <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>             <dbl>
1 Value siTDP4… siTDP4…     4     4  1e-9 ****      1e-9 ****               1.37
# … with 3 more variables: groups <named list>, xmin <dbl>, xmax <dbl>
for_bar_plot_szx <- for_bar_plot_sy_df %>%
  filter(Gene == "TDP43" & (Condition == "siTDP43+siCTRL" | Condition == "siTDP43+siUPF1" | Condition == "Non-treated")
         | Gene == "UPF1" & (Condition == "siTDP43+siCTRL" | Condition == "siTDP43+siUPF1"))

#fix position of stats for plot
stattest_sx$xmin[1] <- 1.8
stattest_sx$xmax[1] <- 2.2
stattest_sx$y.position[1] <- 1.3
stattest_sz$xmin[1] <- 0.75
stattest_sz$xmin[2] <- 0.75
stattest_sz$xmax[1] <- 1
stattest_sz$xmax[2] <- 1.25
stattest_sz$y.position[1] <- 1.2
stattest_sz$y.position[2] <- 1.3

nmd_plotz = ggbarplot(for_bar_plot_szx,
                     x = 'Gene',
                     add = c("mean_se","jitter"),
                     y = 'Value',
                     color = 'Condition',
                     position = position_dodge(0.8),
                     dot.size = 10) +
  scale_y_continuous() + ylab("Percent of transcript after UPF1 knockdown") + xlab("Gene") + 
  ggpubr::theme_pubr() + stat_pvalue_manual(stattest_sz) + stat_pvalue_manual(stattest_sx)
plot(nmd_plotz)

#assess rescue of NMD-sensitive transcripts after UPF1 inhibition
for_bar_plot_sy <- for_bar_plot_sy_df %>%
  filter(Gene == "hnRNPL" | Gene == "STMN2" | Gene == "UNC13A" | Gene == "UNC13B")

shapiro.test(filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "hnRNPL")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "hnRNPL")$Value
W = 0.97268, p-value = 0.858
shapiro.test(filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "STMN2")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "STMN2")$Value
W = 0.98571, p-value = 0.9346
shapiro.test(filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "UNC13A")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "UNC13A")$Value
W = 0.87364, p-value = 0.3122
shapiro.test(filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "UNC13B")$Value)

    Shapiro-Wilk normality test

data:  filter(for_bar_plot_sy, Condition == "siTDP43+siUPF1" & Gene == "UNC13B")$Value
W = 0.96734, p-value = 0.8249
stattest_sy <- for_bar_plot_sy %>%
  group_by(Gene) %>%
  pairwise_t_test(Value ~ Condition, ref.group = "siTDP43+siCTRL", alternative = "greater") %>%
  add_significance() %>% 
  add_xy_position(x="Condition")
stattest_sy
# A tibble: 4 x 14
  Gene   .y.   group1  group2     n1    n2       p p.signif   p.adj p.adj.signif
  <chr>  <chr> <chr>   <chr>   <int> <int>   <dbl> <chr>      <dbl> <chr>       
1 hnRNPL Value siTDP4… siTDP4…     4     4 6.38e-4 ***      6.38e-4 ***         
2 STMN2  Value siTDP4… siTDP4…     4     4 9.91e-1 ns       9.91e-1 ns          
3 UNC13A Value siTDP4… siTDP4…     4     4 7.64e-3 **       7.64e-3 **          
4 UNC13B Value siTDP4… siTDP4…     4     4 8.26e-4 ***      8.26e-4 ***         
# … with 4 more variables: y.position <dbl>, groups <named list>, xmin <dbl>,
#   xmax <dbl>
nmd_plot = ggbarplot(for_bar_plot_sy, 
                     x = 'Gene', 
                     add = c("mean_se","jitter"), 
                     y = 'Value',
                     color = 'Condition',
                     fill = "Condition",
                     position = position_dodge(0.8), 
                     dot.size = 20,
                     size = 1.5) +
  scale_y_continuous() + 
  ggpubr::theme_pubr() + ylab("Percent of transcript after UPF1 knockdown") + xlab("Gene") +
  stat_pvalue_manual(stattest_sy, label="p.signif", x = "Gene") +
    theme(axis.ticks.length=unit(0.1,"inch"),
        axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))+
  scale_fill_manual(values = c("#fca361ff","#aac043ff")) +
  scale_color_manual(values = c("#000000ff","#000000ff"))

plot(nmd_plot)

iPSC protein abundance mass spec

peptides = fread(file.path(here::here(),"data","peptide_abdundance.csv"))

peptides %>%
    mutate(gene = fct_relevel(gene, "UNC13A","UNC13B")) %>%
    ggbarplot(,
              x = "condition",
              add = c("mean_se"),
              y = "protein",
              fill = 'condition',
              color = 'condition',
              position = position_dodge(0.8),
              facet.by = 'gene') +
    ggpubr::theme_pubr() +
    scale_fill_manual(
        values = c("#40B0A6","#E1BE6A")
    ) +
    scale_color_manual(
        values = c("#1C2617","#262114")
    ) +
    ylab("Protein abundance") +
    xlab("") +
    geom_jitter(pch = 21,height = 0,aes(fill = condition),size = 2) +
    guides(color = FALSE) +
    facet_wrap(~gene, scales = 'free') +
    theme(legend.position = 'none') +
    scale_y_continuous(labels = scientific) +
    stat_compare_means(comparisons = list(c("Control","TDP-43 KD")),
                       label = 'p.signif',tip.length = 0,
                       size = 10)

Patient summary statistics

force_colors = c("#F8766D", "#A3A500", "#00BF7D", "#00B0F6", "#E76BF3")
names(force_colors) = c("Control","ALS \n non-TDP","ALS-TDP","FTLD \n non-TDP","FTLD-TDP")
clean_data_table = fread(file.path(here::here(),"data","nygc_junction_information.csv"))
clean_data_table = clean_data_table %>%
    mutate(rs12973192 = fct_relevel(rs12973192,
                                    "C/C", "C/G", "G/G")) %>%
    mutate(number_g_alleles = as.numeric(rs12973192) - 1) %>%
    mutate(unc13a_cryptic_leaf_psi = ifelse(is.na(unc13a_cryptic_leaf_psi),0,unc13a_cryptic_leaf_psi)) %>%
    mutate(junction_reads_stmn2 = STMN2_annotated_leaf + STMN2_cryptic_leaf) %>%
    mutate(junction_reads_unc13a = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf + UNC13A_annotated_leaf) %>%
    unique()

print(glue::glue("Number of unique patients: {clean_data_table[,length(unique(participant_id))]}"))
Number of unique patients: 377
print(glue::glue("Number of unique tissue samples: {clean_data_table[,length(unique(sample))]}"))
Number of unique tissue samples: 1349
print("Patients Per Disease Category")
[1] "Patients Per Disease Category"
clean_data_table[,length(unique(participant_id)),by = disease]
   disease  V1
1: ALS-FTD  23
2:     ALS 193
3: Control  77
4:   Other  11
5:     FTD  61
6:  ALS-AD  12
print("Tissues Per Disease Category")
[1] "Tissues Per Disease Category"
clean_data_table[,length(unique(sample)),by = disease]
   disease  V1
1: ALS-FTD 110
2:     ALS 764
3: Control 199
4:   Other  70
5:     FTD 138
6:  ALS-AD  68
print("Number of patients per UNC13A SNP genotype")
[1] "Number of patients per UNC13A SNP genotype"
unique(clean_data_table[,.(participant_id,rs12608932,rs12973192)]) %>% select(-participant_id) %>% table()
          rs12973192
rs12608932 C/C C/G G/G
       A/A 166   0   0
       A/C   0 152   1
       C/C   0   1  57
print("Number of tissues per disease")
[1] "Number of tissues per disease"
clean_data_table[,.N,by = c("disease","tissue_clean")]
    disease         tissue_clean   N
 1: ALS-FTD       Frontal_Cortex  22
 2:     ALS       Frontal_Cortex 132
 3: Control       Frontal_Cortex  40
 4:   Other       Frontal_Cortex  11
 5: ALS-FTD   Lumbar_Spinal_Cord  15
 6:     ALS   Lumbar_Spinal_Cord 105
 7: Control   Lumbar_Spinal_Cord  33
 8:   Other   Lumbar_Spinal_Cord   9
 9: ALS-FTD Cervical_Spinal_Cord  14
10:     ALS Cervical_Spinal_Cord 103
11: Control Cervical_Spinal_Cord  32
12:   Other Cervical_Spinal_Cord  10
13: ALS-FTD         Motor_Cortex  28
14:     ALS         Motor_Cortex 175
15: Control         Motor_Cortex  23
16:   Other         Motor_Cortex  16
17: ALS-FTD           Cerebellum  13
18:     ALS           Cerebellum 129
19: Control           Cerebellum  28
20:   Other           Cerebellum   8
21:     FTD           Cerebellum  58
22:     FTD       Frontal_Cortex  45
23:  ALS-AD           Cerebellum  11
24:  ALS-AD         Motor_Cortex  13
25:  ALS-AD Cervical_Spinal_Cord  10
26:  ALS-AD   Lumbar_Spinal_Cord  11
27:  ALS-AD       Frontal_Cortex  12
28:  ALS-AD     Occipital_Cortex   7
29:  ALS-AD Thoracic_Spinal_Cord   4
30:     ALS     Occipital_Cortex  37
31:     ALS Thoracic_Spinal_Cord  33
32: ALS-FTD     Occipital_Cortex   6
33: Control      Temporal_Cortex  23
34:     ALS      Temporal_Cortex  23
35:     FTD      Temporal_Cortex  35
36:   Other     Occipital_Cortex   7
37:   Other Thoracic_Spinal_Cord   6
38: Control Thoracic_Spinal_Cord   5
39: Control     Occipital_Cortex   5
40: ALS-FTD Thoracic_Spinal_Cord   5
41:     ALS          Hippocampus  27
42: ALS-FTD          Hippocampus   7
43:   Other          Hippocampus   3
44: Control          Hippocampus  10
    disease         tissue_clean   N
print("Number of partcipants by mutation and  disease")
[1] "Number of partcipants by mutation and  disease"
clean_data_table[,length(unique(participant_id)),by = c("mutations","disease")]
    mutations disease  V1
 1:      None ALS-FTD  13
 2:      None     ALS 145
 3:   C9orf72 ALS-FTD  10
 4:      None Control  77
 5:      None   Other  11
 6:      SOD1     ALS   8
 7:      OPTN     ALS   4
 8:   C9orf72     ALS  32
 9:     MATR3     ALS   1
10:       ANG     ALS   1
11:   C9orf72     FTD  12
12:      None  ALS-AD  11
13:      None     FTD  42
14:   C9orf72  ALS-AD   1
15:      TBK1     FTD   2
16:      MAPT     FTD   5
17:       FUS     ALS   2
print(glue::glue("Number of patients per pathology:"))
Number of patients per pathology:
clean_data_table[,length(unique(participant_id)),by = .(pathology)]
    pathology  V1
 1:   ALS-FTD  23
 2:       ALS 193
 3:   control  77
 4:     Other  11
 5:            13
 6:    ALS-AD  12
 7: FTD-TDP-A  24
 8: FTD-TDP-B   3
 9: FTD-TDP-C   9
10:   FTD-TAU   7
11:   FTD-FUS   5

UNC13A cryptic is an event that is specific to TDP-43 proteinopathy

FTLD-non-TDP are those with TAU and FUS aggregates

Non-tdp ALS are those with SOD1 or FUS mutations. The n’s are quite low on this unfortunately, only 8 ALS with SOD1 and 2 with FUS mutations.

First we look at detection rate in tissues affected by TDP-43 proteinopathy, For FTLD this is frontal and temporal Cortices, and for ALS this is lumbar, cervical, and thoracic spinal cord samples as well as motor cortex. For controls we also take all 6 tissues, frontal,temporal,motor cortices and the lumbar, cervical, and thoracic spinal cords.

(As a side note, once we do this the number of ALS-non-TDP drops down to 6 (2 FUS) because the ALS sample tissues are not balanced and not every participant has samples in every tissue)

####Inclusion reads by if TDP-potential####
boxplot_table = clean_data_table %>%
    mutate(across(UNC13A_3prime_leaf:UNC13A_annotated_leaf, ~ .x / library_size,.names = "{.col}_library_norm")) %>%
    filter(!tissue_clean %in% c("Choroid","Liver")) %>%
    dplyr::select(sample,participant_id,mutations,disease_group2,pathology,tissue_clean,contains("_library_norm")) %>%
    melt() %>%
    filter(grepl("_3prime|_5prime_1",variable)) %>%
    group_by(sample) %>%
    mutate(inclusion_reads = sum(value)) %>%
    ungroup() %>%
    unique() %>%
    mutate(junction_name = case_when(variable == "UNC13A_3prime_leaf_library_norm" ~ "  Novel Donor",
                                 variable == "UNC13A_5prime_1_leaf_library_norm" ~ " Short Novel Acceptor",
                                 variable == "UNC13A_5prime_2_leaf_library_norm"~ "Long Novel Acceptor")) %>%
    mutate(disease_tissue = case_when((grepl("FTLD",disease_group2) & grepl("Cortex",tissue_clean))  ~ T,
                                      (grepl("ALS",disease_group2) & grepl("Cord|Motor",tissue_clean))  ~ T,
                                      (grepl("Occipital",tissue_clean)) ~ F,
                                      (grepl("Control",disease_group2) & grepl("Cord|Cortex",tissue_clean)) ~ T,
           TRUE ~ F)) %>%
    mutate(tissue_clean = gsub("_"," ",tissue_clean))

melt_count = clean_data_table[,.(sample,UNC13A_3prime_leaf,UNC13A_5prime_1_leaf,UNC13A_5prime_2_leaf)] %>% data.table::melt() %>% setnames(.,"value","orig_count")

Pecent of patients UNC13A detected

Looking at disease tissue only, so just taking the cord and motor cortex in ALS and the frontal and temporal cortex of FTLD and then the cord and cortices in Controls.

clean_data_table %>%
  filter(disease_tissue == T) %>%
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(detected = inclusion_reads > 0) %>%
  dplyr::select(participant_id,disease_group2,detected) %>%
  unique() %>% 
  group_by(disease_group2) %>%
  mutate(n_sample = n_distinct(participant_id)) %>%
  mutate(n_sample_detected = sum(detected)) %>%
  dplyr::select(disease_group2,n_sample,n_sample_detected) %>%
  unique() %>%
  mutate(detection_rate = n_sample_detected / n_sample) %>%
  mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>%
  mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>%
  ggplot() +
  geom_col(aes(x = detection_name, y = detection_rate)) +
  ggpubr::theme_pubr() +
  scale_y_continuous(lim = c(0,1),labels = scales::percent) +
  ylab("Percent of Patients \n UNC13A Cryptic Detected") +
  theme(text = element_text(size = 24)) +
  xlab("N individuals")

Effect of sequencing machine (read length) on CE discovery

# library size 
#llumina HiSeq 2500 (125 bp paired end) or an Illumina NovaSeq (100 bp paired end). 
unc13a_sequencing_platform <- clean_data_table %>%
  filter(tissue_clean %in% c("Frontal_Cortex", 
                             "Lumbar_Spinal_Cord", 
                             "Cervical_Spinal_Cord", 
                             "Motor_Cortex", 
                             "Temporal_Cortex")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
  mutate(tissue = as.character(tissue)) %>%
  filter(sequencing_platform != "") %>% 
  filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP"))

quick_fisher = function(tbl){
  
    if(length(unique(tbl$sequencing_platform)) < 2){
      return(NA)
    }
    comparison_table = table(unc13a = tbl$unc13a_detected, 
                             sequencing_platform = tbl$sequencing_platform)
    p_value = fisher.test(comparison_table)$p.value
    
    return(p_value)
}



unc13a_data_fisher = unc13a_sequencing_platform %>% 
  filter(tissue_clean != "Thoracic_Spinal_Cord") %>% 
  group_by(disease_group2,tissue_clean) %>% 
  nest() %>% 
  mutate(fisher_pvalue = map_dbl(data, quick_fisher)) 



unc13a_data_fisher$data = NULL

unc13a_sequencing_platform_pct = unc13a_sequencing_platform %>%
  group_by(disease_group2,tissue_clean, sequencing_platform, unc13a_detected) %>% tally() %>%
  spread(key = unc13a_detected, value = n,fill = 0) %>%
  mutate(detection = `+` / ( `-` + `+`) ) %>% 
  mutate(total =  ( `-` + `+`) ) %>% 
  mutate(plot_name = glue::glue("{sequencing_platform} \n  ({total})")) %>% 
  left_join(unc13a_data_fisher)
Joining, by = c("disease_group2", "tissue_clean")
unc13a_sequencing_platform_pct %>%
  ggplot(aes(x = plot_name, y = detection )) + 
  geom_col(fill = "firebrick") +
  labs(title = "UNC13A CE detection and sequencing platform", 
       y = "UNC13A CE detected",x = element_blank(), subtitle = "Fisher exact test") + 
  facet_wrap(~disease_group2 + tissue_clean,scales = 'free_x') +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  geom_text(aes(x = 1.5, y = 1.05, label = paste0("p = ", signif(fisher_pvalue,digits = 2) ) )) +
  theme(legend.text = element_text(size = 15)) +
  ggpubr::theme_pubr() +
  scale_fill_manual(values = ("#b3251f"))

 # scale_y_continuous(expand = c(0,0) ) 

Sequencing platform and disease type

Were there systemic differences in sequence platform between ALS and FTD?

tmp = clean_data_table %>%
  filter(tissue_clean %in% c("Frontal_Cortex", 
                             "Lumbar_Spinal_Cord", 
                             "Cervical_Spinal_Cord", 
                             "Motor_Cortex", 
                             "Temporal_Cortex")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
  mutate(tissue = as.character(tissue)) %>%
  filter(sequencing_platform != "") %>% 
  filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP")) %>% 
  dplyr::select(tissue_clean,disease_group2,sequencing_platform) %>% 
  group_by(tissue_clean) %>% 
  nest()

tmp %>% 
  mutate(chisq_res = map(data, ~ .x %>% table() %>% chisq.test() %>% pluck('residuals')))
# A tibble: 5 x 3
# Groups:   tissue_clean [5]
  tissue_clean         data               chisq_res      
  <chr>                <list>             <list>         
1 Frontal_Cortex       <tibble [199 × 2]> <table [2 × 2]>
2 Lumbar_Spinal_Cord   <tibble [125 × 2]> <dbl [2]>      
3 Cervical_Spinal_Cord <tibble [130 × 2]> <dbl [2]>      
4 Motor_Cortex         <tibble [201 × 2]> <dbl [2]>      
5 Temporal_Cortex      <tibble [52 × 2]>  <table [2 × 2]>

Library size by disease status and tissue

# library size 

plot_list = list()

lib_comps = c("Cervical_Spinal_Cord", "Frontal_Cortex", 
              "Lumbar_Spinal_Cord", "Temporal_Cortex",
              "Motor_Cortex", "Cerebellum")

plot_list = list()
for(t in 1:length(lib_comps)){
  plt = clean_data_table %>%
     filter(tissue_clean == lib_comps[t]) %>% 
    mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
    mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
    mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
    ggplot(aes(x = disease_group2,y = log10(library_size),fill = disease_group2)) + 
    geom_boxplot(colour = "black", outlier.colour = NA) +
    geom_point(pch = 21, size = 2, alpha = 0.9, position = position_jitter(width = 0.2, height = 0) ) + 
    facet_wrap(~ tissue_clean,scales = 'free_x') + 
    # ggpubr::stat_compare_means(label = "p.format", label.x.npc = "center", method = "wilcox" ) +
    ggpubr::stat_compare_means(label = "p.signif",
                               label.x.npc = "center",
                               method = "wilcox",
                               comparisons = list(
                                                  c("Control","ALS-TDP"),
                                                  c("ALS \n non-TDP","ALS-TDP"))) +
      ggpubr::stat_compare_means(label = "p.signif", 
                               label.x.npc = "center", 
                               method = "wilcox",
                               comparisons = list(
                                                  c("Control","FTLD-TDP"),
                                                  c("FTLD-TDP","FTLD \n non-TDP"))) + 
    labs(y = "log10 library size", fill= "UNC13A CE detected", x = element_blank()) +
    theme(axis.text.x = element_text(size = 15)) + 
    ggpubr::theme_pubr() + 
    scale_fill_manual(values = force_colors) +
            theme(axis.ticks.x = element_blank(),
              axis.text.x = element_blank(),
              legend.position = "none") + 
    ylab(element_blank())
  
  plot_list[[t]] <- plt
}

cowplot::plot_grid(plotlist = plot_list,nrow = 3)

Effect of library depth on CE discovery

unc13a_library_depth <- clean_data_table %>%
  filter(tissue_clean %in% c("Frontal_Cortex", 
                             "Lumbar_Spinal_Cord", 
                             "Cervical_Spinal_Cord", 
                             "Motor_Cortex", 
                             "Temporal_Cortex")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
  filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP"))



unc13a_library_depth %>%
  group_by(disease_group2,tissue_clean) %>% 
  add_count() %>% 
  mutate(plot_name = glue::glue("{tissue_clean} \n {n}")) %>% 
  ggplot(aes(x = unc13a_detected, y = log10(library_size))) + 
  geom_boxplot(aes(fill = unc13a_detected), colour = "black", outlier.colour = NA) + 
  facet_wrap(~disease_group2+ plot_name) + 
  geom_jitter(height = 0,size = 2,alpha = 0.3) + 
  ggpubr::stat_compare_means(label = "p.format", label.x.npc = "center", method = "wilcox", label.y.npc = 0.95) + 
  ggpubr::theme_pubr() +
  xlab("UNC13A CE Detected") + 
  scale_fill_manual(values = c("#EBC3B9","#b3251f")) + 
  theme(legend.position = 'none')

RIN between cases and controls

force_colors = c("#F8766D", "#A3A500", "#00BF7D", "#00B0F6", "#E76BF3")
names(force_colors) = c("Control","ALS \n non-TDP","ALS-TDP","FTLD \n non-TDP","FTLD-TDP")

plot_list = list()

lib_comps = c("Cervical_Spinal_Cord", "Frontal_Cortex", 
              "Lumbar_Spinal_Cord", "Temporal_Cortex",
              "Motor_Cortex", "Cerebellum")

plot_list = list()
for(t in 1:length(lib_comps)){

        groups = c("ALS-TDP", "Control", "ALS \n non-TDP", "FTLD-TDP", "FTLD \n non-TDP")
    
    
    plt = clean_data_table %>%
        filter(tissue_clean == lib_comps[t]) %>% 
        mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
        filter(disease_group2 %in% groups) %>% 
        
        ggplot(aes(x = disease_group2,y = rin,fill =disease_group2 )) + 
        geom_boxplot(colour = "black", outlier.colour = NA) +
        geom_point(pch = 21, size = 2, alpha = 0.9, position = position_jitter(width = 0.2, height = 0) ) + 
        facet_wrap(~ tissue_clean,scales = 'free_x') + 
        # ggpubr::stat_compare_means(label = "p.format", label.x.npc = "center", method = "wilcox" ) +
        ggpubr::stat_compare_means(label = "p.signif",
                                   label.x.npc = "center",
                                   method = "wilcox",
                                   comparisons = list(
                                       c("Control","ALS-TDP"),
                                       c("ALS \n non-TDP","ALS-TDP"))) +
        ggpubr::stat_compare_means( label = "p.signif",
                                   label.x.npc = "center", 
                                   method = "wilcox",
                                   comparisons = list(
                                       c("Control","FTLD-TDP"),
                                       c("FTLD-TDP","FTLD \n non-TDP"))) + 
        labs(y = "RIN", x = element_blank()) +
        theme(axis.text.x = element_text(size = 15)) + 
        ggpubr::theme_pubr() + 
        theme(axis.ticks.x = element_blank(),
              axis.text.x = element_blank(),
              legend.position = "none") +
      scale_color_manual(values = force_colors) + 
      scale_fill_manual(values = force_colors)
    
    plot_list[[t]] <- plt
}

cowplot::plot_grid(plotlist = plot_list,ncol = 2)    

RIN and detection in cases

unc13a_rin <- clean_data_table %>%
  filter(tissue_clean %in% c("Frontal_Cortex", 
                             "Lumbar_Spinal_Cord", 
                             "Cervical_Spinal_Cord", 
                             "Motor_Cortex", 
                             "Temporal_Cortex")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
  filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP")) %>% 
  filter(rin != "")



unc13a_rin %>%
  group_by(disease_group2,tissue_clean) %>% 
  add_count() %>% 
  mutate(plot_name = glue::glue("{tissue_clean} \n {n}")) %>% 
  ggplot(aes(x = unc13a_detected, y = rin)) + 
  geom_boxplot(aes(fill = unc13a_detected), colour = "black", outlier.colour = NA) + 
  facet_wrap(~disease_group2+ plot_name) + 
  ylab("RIN") +
  geom_jitter(height = 0,size = 2,alpha = 0.3) + 
  ggpubr::stat_compare_means(label = "p.format", label.x.npc = "center", method = "wilcox", label.y.npc = 0.95) + 
  ggpubr::theme_pubr() +
  xlab("UNC13A CE Detected") +
  scale_fill_manual(values = c("#EBC3B9","#b3251f")) + 
  theme(legend.position = 'none')

Effect of UNC13A discovery and UNC13B TPM

## effect of RIN?
clean_data_table %>%
  filter(disease_tissue == T) %>% 
  filter(disease_group2 %in% c("FTLD-TDP", "ALS-TDP")) %>%
  add_count(tissue_clean,disease_group2) %>% 
  mutate(plot_name = glue::glue("{disease_group2} \n ({n})")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-") ) %>% 
  ggplot(aes(x = plot_name, fill = unc13a_detected,y = UNC13B_TPM )) + 
  geom_point(size = 2,show.legend = F,pch = 21, position = position_jitterdodge(dodge.width=1,jitter.width = 0.3))+
  geom_boxplot(outlier.colour = NA) +
  labs(title = "UNC13A CE detection and UNC13B TPM",subtitle = "Wilcox test") + 
  facet_wrap(~ tissue_clean,scales = 'free') + 
  ggpubr::stat_compare_means(label = "p.signif") +
  labs(y = "UNC13B TPM", x = "UNC13A CE detection") +
  theme(axis.text.x = element_text(size = 15)) +
  ggpubr::theme_pubr() +
  ggpubr::stat_compare_means(label = "p.format", label.x.npc = "center",label.y.npc = 0.9)

Celltype composition between cases and controls

force_colors = c("#F8766D", "#A3A500", "#00BF7D", "#00B0F6", "#E76BF3")
names(force_colors) = c("Control","ALS \n non-TDP","ALS-TDP","FTLD \n non-TDP","FTLD-TDP")

plot_list = list()

lib_comps = c("Cervical_Spinal_Cord", "Frontal_Cortex", 
              "Lumbar_Spinal_Cord", "Temporal_Cortex",
              "Motor_Cortex", "Cerebellum")

cell_types = fread(paste0(here::here(),"/data/All_1917_samples_Darmanis_dtangle_deconv.tsv"))
for(t in 1:length(lib_comps)){
    
    

        groups = c("ALS-TDP", "Control", "ALS \n non-TDP", "FTLD-TDP", "FTLD \n non-TDP")
    

    plt = cell_types %>% 
        left_join(clean_data_table[,.(sample,
                                      UNC13A_3prime_leaf,
                                      UNC13A_5prime_1_leaf,
                                      disease_group2,
                                      disease_tissue)], by = 'sample') %>%
        filter(!is.na(disease_group2)) %>% 
        unique() %>% 
        filter(tissue_clean == lib_comps[t]) %>% 
        filter(disease_group2 %in% groups) %>% 
        mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
        ggplot(aes(x = disease_group2,y = deconv,fill =disease_group2 )) + 
        geom_boxplot(colour = "black", outlier.colour = NA) +
        facet_wrap(~cell,scales = 'free_x',nrow = 1) +
        ggpubr::stat_compare_means(label = "p.signif", 
                                   label.x.npc = "center", 
                                   method = "wilcox",
                                   comparisons = list(
                                       c("Control","FTLD-TDP"),
                                       c("FTLD-TDP","FTLD \n non-TDP"))) +
        ggpubr::stat_compare_means(label = "p.signif",
                                   label.x.npc = "center",
                                   method = "wilcox",
                                   comparisons = list(
                                       c("Control","ALS-TDP"),
                                       c("ALS \n non-TDP","ALS-TDP"))) +
        ggtitle(lib_comps[t]) + 
        labs(fill = element_blank(),x = element_blank()) + 
        geom_point(pch = 21, size = 2, alpha = 0.9, position = position_jitter(width = 0.2, height = 0) ) + 
        ggpubr::theme_pubr() +
        theme(axis.ticks.x = element_blank(),
              axis.text.x = element_blank(),
              legend.position = "none") + 
        scale_fill_manual(values = force_colors)
plot_list[[t]] <- plt

}

cowplot::plot_grid(plotlist = plot_list,ncol = 2)    

Effect of celltype composition on UNC13A CE detection

cell_types = fread("/Users/annaleigh/Documents/GitHub/unc13a_cryptic_splicing/data/All_1917_samples_Darmanis_dtangle_deconv.tsv") 


cell_types %>% 
  left_join(clean_data_table[,.(sample,
                               UNC13A_3prime_leaf,
                               UNC13A_5prime_1_leaf,
                               disease_group2,
                               disease_tissue)], by = 'sample') %>%
  filter(tissue_clean %in% c("Frontal_Cortex", 
                             "Lumbar_Spinal_Cord", 
                             "Cervical_Spinal_Cord", 
                             "Motor_Cortex", 
                             "Temporal_Cortex")) %>% 
  mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
  mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
  filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP")) %>% 
  group_by(disease_group2,tissue_clean) %>% 
  add_count() %>% 
  mutate(n = n / 5) %>% 
  mutate(plot_name = glue::glue("{tissue_clean} \n {n}")) %>% 
  filter(!is.na(inclusion_reads)) %>% 
  unique() %>% 
  ggplot(aes(x = cell, y = deconv, fill = unc13a_detected)) + 
  # geom_point(size = 2,show.legend = F,pch = 21, position = position_jitterdodge(dodge.width=1,jitter.width = 0.3))+
  geom_boxplot(outlier.color = NA) + 
  facet_wrap(~disease_group2+plot_name,scales = 'free') +
  ggpubr::stat_compare_means(label = "p.format",vjust = 1.3) +
  ggpubr::theme_pubr() +
  scale_fill_manual(values = c("#EBC3B9","#b3251f")) + 
  theme(legend.position = 'none') +
  xlab(element_blank())

Pecent of samples UNC13A detected by tissue

boxplot_table %>%
  filter(!(tissue_clean %in% c("Cerebellum","Hippocampus","Occipital Cortex"))) %>%
  mutate(detected = inclusion_reads > 0) %>%
  dplyr::select(sample,disease_group2,detected,tissue_clean) %>%
  unique() %>%
  group_by(disease_group2,tissue_clean) %>%
  mutate(n_sample = n_distinct(sample)) %>%
  mutate(n_sample_detected = sum(detected)) %>%
  dplyr::select(tissue_clean,disease_group2,n_sample,n_sample_detected) %>%
  unique() %>%
  mutate(detection_rate = n_sample_detected / n_sample) %>%
  mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>%
  ungroup() %>%
  filter(grepl("ALS-TDP|FTLD-T",disease_group2)) %>%
  mutate(tissue_clean = fct_relevel(tissue_clean, "Cervical Spinal Cord",
                                    "Frontal Cortex",
                                    "Lumbar Spinal Cord",
                                    "Motor Cortex",
                                    "Thoracic Spinal Cord")) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>%
  mutate(cortex = ifelse(grepl(" Cord",tissue_clean),"cot","cor")) %>%
  ggplot() +
  geom_col(aes(x = detection_name,
               y = detection_rate,
               fill = detection_name),color = 'black',position = position_dodge2(width = 0.8, preserve = "single")) +
  ggpubr::theme_pubr() +
  ylab("Percent of Tissues \n UNC13A Cryptic Detected") +
  theme(text = element_text(size = 18)) +
  xlab("N tissues") +
  facet_wrap(~tissue_clean,scales = 'free_x',nrow = 3) +
    scale_y_continuous(lim = c(0,1),labels = scales::percent,expand = c(0,0) ) +
  theme(axis.text.x =  element_text(size = 14)) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 18))  +
  scale_fill_manual(values = c("#00bd7dff","#00bd7dff","#00bd7dff","#00bd7dff","#00bd7dff","red","#e668f3ff","#e668f3ff")) +
  theme(legend.position = "none")

Pecent of samples UNC13A detected by tissue and genotype

boxplot_table %>%
  left_join(clean_data_table %>% dplyr::select(sample, rs12973192)) %>%
  unique() %>%
  filter(!(tissue_clean %in% c("Cerebellum","Hippocampus","Occipital Cortex"))) %>%
  mutate(detected = inclusion_reads > 0) %>%
  dplyr::select(sample,disease_group2,detected,tissue_clean,rs12973192) %>%
  unique() %>%
  group_by(disease_group2,tissue_clean,rs12973192) %>%
  mutate(n_sample = n_distinct(sample)) %>%
  mutate(n_sample_detected = sum(detected)) %>%
  dplyr::select(tissue_clean,disease_group2,n_sample,n_sample_detected) %>%
  unique() %>%
  mutate(detection_rate = n_sample_detected / n_sample) %>%
  mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>%
  ungroup() %>%
  filter(grepl("ALS-TDP|FTLD-T",disease_group2)) %>%
  mutate(tissue_clean = fct_relevel(tissue_clean, "Cervical Spinal Cord",
                                    "Frontal Cortex",
                                    "Lumbar Spinal Cord",
                                    "Motor Cortex",
                                    "Thoracic Spinal Cord")) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>%
  mutate(cortex = ifelse(grepl(" Cord",tissue_clean),"cot","cor")) %>%
  ggplot() +
  geom_col(aes(x = detection_name, y = detection_rate,fill = rs12973192),position = position_dodge2(width = 0.8, preserve = "single")) +
  ggpubr::theme_pubr() +
  ylab("Percent of Tissues \n UNC13A Cryptic Detected") +
  theme(text = element_text(size = 18)) +
  xlab("N tissues") +
  facet_wrap(~tissue_clean,scales = 'free_x',nrow = 3) +
    scale_y_continuous(lim = c(0,1),labels = scales::percent,expand = c(0,0) ) +
  theme(axis.text.x =  element_text(size = 14)) +
      theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 18))  +
  theme(legend.position = "bottom")
Joining, by = "sample"
Adding missing grouping variables: `rs12973192`

UNC13A CE level across tissue

This only plots the short novel acceptor and novel donor

star_maker = function (p.value) 
{
    unclass(symnum(p.value, corr = FALSE, na = FALSE, cutpoints = c(0, 0.0001,
        0.001, 0.01, 0.05, 0.1, 1), symbols = c("****","***", "**", 
        "*", ".", " ")))
}
only_short_plotter = boxplot_table %>%
    filter(tissue_clean %in%
               c("Frontal Cortex",
                 "Lumbar Spinal Cord",
                 "Cervical Spinal Cord",
                 "Motor Cortex",
                 "Temporal Cortex",
                 "Cerebellum") )%>%
    group_by(disease_group2,tissue_clean) %>%
    mutate(n_sample = n_distinct(sample)) %>%
    ungroup() %>%
    mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
    mutate(tissue_clean = fct_relevel(tissue_clean, "Cervical Spinal Cord",
                                      "Frontal Cortex",
                                      "Lumbar Spinal Cord",
                                      "Temporal Cortex",
                                      "Motor Cortex"))



only_short_plotter %>% 
  dplyr::select(sample,tissue_clean,inclusion_reads,disease_group2) %>% 
  mutate(disease_group2 = gsub("\n", "",disease_group2)) %>% 
  unique() %>% 
  group_by(tissue_clean) %>% 
  nest() %>%
  mutate(wilcox_res = map(data, ~pairwise.wilcox.test(.x$inclusion_reads,.x$disease_group2) %>% broom::tidy())) %>%
  dplyr::select(-data) %>% 
  unnest() %>% 
  mutate(star_plot = star_maker(p.value)) %>% fwrite("~/Desktop/stars_for_weaverly_fig3b.csv")
  


only_short_plotter %>% 
    ggplot(aes(x = detection_name,
               y = inclusion_reads * 10^6,
               fill ="red")) +
    geom_boxplot(show.legend = F,position = position_dodge(preserve = "single",width = 1),outlier.colour = NA) +
    geom_point(size = 2,show.legend = F,pch = 21, position = position_jitterdodge(dodge.width=1,jitter.width = 0.3))+
    ggplot2::facet_wrap(vars(tissue_clean),scales = "free_x",nrow = 3) +
    ylab("UNC13A cryptic \n reads per million") +
    theme(text = element_text(size = 12)) +
    xlab("") +
    scale_fill_manual(values = colorblind_pal()(4)[2:4]) +
    theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(text = element_text(size = 18))  +
    theme(legend.position = "bottom") 

UNC13A CE level across tissue - split by junction type

clean_data_table %>%
    mutate(across(UNC13A_3prime_leaf:UNC13A_annotated_leaf, ~ .x / library_size,.names = "{.col}_library_norm")) %>%
    filter(!tissue_clean %in% c("Choroid","Liver")) %>%
    dplyr::select(sample,participant_id,mutations,disease_group2,pathology,tissue_clean,contains("_library_norm")) %>%
    melt() %>%
    filter(grepl("_3prime|_5prime_1|_5prime_2",variable)) %>%
      # filter(grepl("_5prime_2",variable)) %>%
    unique() %>%
    mutate(junction_name = case_when(variable == "UNC13A_3prime_leaf_library_norm" ~ "  Novel Donor",
                                 variable == "UNC13A_5prime_1_leaf_library_norm" ~ " Short Novel Acceptor",
                                 variable == "UNC13A_5prime_2_leaf_library_norm"~ "Long Novel Acceptor")) %>%
    mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  dplyr::select(sample,disease_group2,
                tissue_clean,
                junction_name,
                value) %>%
    unique() %>%
    filter(tissue_clean %in%
               c("Frontal Cortex",
                 "Lumbar Spinal Cord",
                 "Cervical Spinal Cord",
                 "Motor Cortex",
                 "Temporal Cortex",
                 "Cerebellum"))%>%
    group_by(disease_group2,tissue_clean) %>%
    mutate(n_sample = n_distinct(sample)) %>%
    ungroup() %>%
    mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
    mutate(tissue_clean = fct_relevel(tissue_clean, "Cervical Spinal Cord",
                                      "Frontal Cortex",
                                      "Lumbar Spinal Cord",
                                      "Temporal Cortex",
                                      "Motor Cortex")) %>%
    ggplot(aes(x = detection_name,
               y = value * 10^6,
               fill = junction_name)) +
    geom_boxplot(show.legend = F,position = position_dodge(preserve = "single")) +
    geom_point(size = 2,pch = 21, alpha = 0.7, position = position_jitterdodge(jitter.width = 0.2))+
    scale_y_log10() +
    ggplot2::facet_wrap(vars(tissue_clean),scales = "free_x",nrow = 3) +
    ylab("UNC13A cryptic \n reads per million") +
    theme(text = element_text(size = 12)) +
    xlab("") +
    scale_fill_manual(values = colorblind_pal()(4)[2:4]) +
        theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 18))  +
  theme(legend.position = "bottom")

UNC13A CE level across disease types

disease_comparisons = list( c("Control","ALS-TDP"),
                           c("Control","ALS \n non-TDP"),
                           c("Control","FTLD-TDP"),
                           c("Control","FTLD \n non-TDP" ))

table_to_test = boxplot_table %>%
    filter(disease_tissue == T) %>%
    group_by(disease_group2) %>%
    mutate(n_sample = n_distinct(sample)) %>%
    mutate(disease_group2 = gsub("Control"," Control",disease_group2)) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )")) %>%
    dplyr::select(detection_name,
                  participant_id,
                  inclusion_reads,
                  disease_group2,
                  tissue_clean,
                  disease_tissue) %>%
    unique()

test_pair = pairwise.wilcox.test(table_to_test$inclusion_reads, table_to_test$detection_name,
                     p.adjust.method = "BH") %>% broom::tidy()

test_pair = test_pair %>%
  mutate(p_value_draw = case_when(p.value < 0.0001~ "***",
                                  p.value < 0.01 ~ "**",
                                   p.value < 0.05 ~ "*",
                          TRUE ~ paste0("Adj. p-value \n",as.character(round(p.value,2))))) %>%
  mutate(y.position = seq(0.25,by = 0.1,length.out = 7))

table_to_test %>%
    ggplot(aes(x = detection_name, y = inclusion_reads * 10^6)) +
    geom_boxplot() +
    geom_jitter(height = 0) +
    scale_y_log10() +
    ggpubr::theme_pubr() +
    ylab("UNC13A cryptic inclusion \n reads per million") +
    theme(text = element_text(size = 24)) +
    xlab("N samples") +
    stat_pvalue_manual(test_pair %>% filter(p.value < 0.05),
                       label = "p_value_draw",size = 8) +
   stat_compare_means(size = 8)

Detection rate as a linear model

UNC13A RNA expression (TPM)

UNC13A TPM by disease and tissue

comps = clean_data_table %>%
  group_by(disease_group2,tissue_clean) %>%
  mutate(n_sample = n_distinct(sample)) %>%
  ungroup() %>%
  mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
  mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>% pull(detection_name) %>%
  unique() %>%
  combn(.,2,simplify = F)

clean_data_table %>%
  group_by(disease_group2,tissue_clean) %>%
  mutate(n_sample = n_distinct(sample)) %>%
  ungroup() %>%
  mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
  mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
 filter(tissue_clean %in%
           c("Frontal Cortex",
             "Lumbar Spinal Cord",
             "Cervical Spinal Cord",
             "Motor Cortex",
             "Temporal Cortex",
             "Cerebellum") )%>%
mutate(tissue_clean = fct_relevel(tissue_clean, "Cervical Spinal Cord",
                              "Frontal Cortex",
                              "Lumbar Spinal Cord",
                              "Temporal Cortex",
                              "Motor Cortex")) %>%
    ggplot(aes(x = detection_name, y = UNC13A_TPM)) +
    ggpubr::theme_pubr() +
    geom_boxplot(aes(fill = disease_group2)) +
    geom_jitter(pch = 21, color = 'black',width  = 0.2,height = 0,aes(fill =   disease_group2)) +
    ylab("UNC13A TPM") +
    xlab("") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20)) +
  facet_wrap(~tissue_clean, scales = 'free',nrow = 3) +
  theme(legend.position = 'none') +
  stat_compare_means(comparisons = comps)

plot_by_tissue = function(tissue_name,dt){

only_tissue = dt %>%
  group_by(disease_group2,tissue_clean) %>%
  mutate(n_sample = n_distinct(sample)) %>%
  ungroup() %>%
  filter(tissue_clean == tissue_name) %>%
  mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
  mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean))

unique_x_values = only_tissue %>%
  pull(detection_name) %>%
  unique()

comparisons = combn(unique_x_values,2,simplify = F)

the_plot = only_tissue %>%
    ggplot(aes(x = detection_name, y = UNC13A_TPM)) +
    ggpubr::theme_pubr() +
    geom_boxplot(aes(fill = disease_group2)) +
    geom_jitter(pch = 21, color = 'black',width  = 0.2,height = 0,aes(fill =   disease_group2)) +
    ylab("UNC13A TPM") +
    xlab("") +
    guides(color = FALSE) +
    theme(text = element_text(size = 20)) +
  facet_wrap(~tissue_clean, scales = 'free',nrow = 3) +
  theme(legend.position = 'none') +
  stat_compare_means() +
  stat_compare_means(label = "p.signif",comparisons = comparisons,)

  return(the_plot)
}

dis_tis = clean_data_table[disease_tissue == T,unique(tissue_clean)]
dis_tis = c("Cerebellum",dis_tis)
my_plots = purrr::map(dis_tis,plot_by_tissue, clean_data_table)
ggarrange(plotlist=my_plots)

UNC13B fsE across NYGC cohort

for_stats = clean_data_table %>% 
    filter(tissue_clean %in%
               c("Frontal_Cortex",
                 "Lumbar_Spinal_Cord",
                 "Cervical_Spinal_Cord",
                 "Motor_Cortex",
                 "Temporal_Cortex",
                 "Cerebellum") )%>%
    group_by(disease_group2,tissue_clean) %>%
    mutate(n_sample = n_distinct(sample)) %>%
    ungroup() %>%
    mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
    mutate(tissue_clean = fct_relevel(tissue_clean, "Motor_Cortex",
                                      "Frontal_Cortex",
                                      "Temporal_Cortex",
                                      "Cervical_Spinal_Cord",
                                      "Lumbar_Spinal_Cord")) %>% 
  mutate(all_the_fse_reads =  (((UNC13B_nmd_junction_5prime + UNC13B_nmd_junction_3prime)/ library_size) * 10^6)) %>% 
  dplyr::select(tissue_clean,all_the_fse_reads,disease_group2) %>% 
  group_by(tissue_clean) %>% 
  nest()
  

stars_table = for_stats %>% 
  mutate(wilcox_test = map(data, ~{pairwise.wilcox.test(.x$all_the_fse_reads,.x$disease_group2) %>% broom::tidy()})) %>% 
  dplyr::select(-data) %>% 
  unnest() %>% 
  ungroup() %>% 
  mutate(stars = gtools::stars.pval(p.value = p.value)) %>% 
  filter(p.value < 0.05) %>% 
  mutate(y.position = seq(from = 1.5,to = 2.6,length.out = 11))
clean_data_table %>% 
    filter(tissue_clean %in%
               c("Frontal_Cortex",
                 "Lumbar_Spinal_Cord",
                 "Cervical_Spinal_Cord",
                 "Motor_Cortex",
                 "Temporal_Cortex",
                 "Cerebellum") )%>%
    group_by(disease_group2,tissue_clean) %>%
    mutate(n_sample = n_distinct(sample)) %>%
    ungroup() %>%
    mutate(disease_group2 = gsub("Control", " Control",disease_group2)) %>%
    mutate(detection_name = glue::glue("{disease_group2} \n ( {n_sample} )"))  %>%
    mutate(tissue_clean = fct_relevel(tissue_clean, "Motor_Cortex",
                                      "Frontal_Cortex",
                                      "Temporal_Cortex",
                                      "Cervical_Spinal_Cord",
                                      "Lumbar_Spinal_Cord")) %>% 
  mutate(all_the_fse_reads =  (((UNC13B_nmd_junction_5prime + UNC13B_nmd_junction_3prime)/ library_size) * 10^6)) %>% 
  ggplot(aes(x = disease_group2,
               y =all_the_fse_reads,
               fill ="red")) +
    geom_boxplot(show.legend = F,position = position_dodge(preserve = "single",width = 1),outlier.colour = NA) +
    geom_point(size = 2,show.legend = F,pch = 21, position = position_jitterdodge(dodge.width=1,jitter.width = 0.3))+
    ggplot2::facet_wrap(~tissue_clean,scales = "free",nrow = 2) +
    ylab("UNC13B fsE \n reads per million") +
    theme(text = element_text(size = 12)) +
    xlab("") +
    scale_fill_manual(values = colorblind_pal()(4)[2:4]) +
    theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(text = element_text(size = 18))  +
    theme(legend.position = "bottom") + 
  stat_pvalue_manual(stars_table, label = "{stars}")

UNC13A and UNC13B Intron Retention in NYGC

stat_test_a = clean_data_table %>% 
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    unique() %>% 
    group_by(tissue_clean) %>% 
    wilcox_test(IRratioUNC13A ~ disease_group2) %>%
    add_significance("p")

stat_test_b = clean_data_table %>% 
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    unique() %>% 
    group_by(tissue_clean) %>% 
    wilcox_test(IRratioUNC13B ~ disease_group2) %>%
    add_significance("p")

clean_data_table %>% 
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    unique() %>% 
    mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
    ggplot(aes(x = disease_group2,
               y = IRratioUNC13A)) +
    geom_boxplot(aes(fill = disease_group2),outlier.shape = NA) + 
   geom_jitter(aes(fill = disease_group2),
               pch = 21,height = 0,
               width = 0.2,size = 3,alpha = 0.8) +
    facet_wrap(~tissue_clean,scales = 'free_x') +
    stat_pvalue_manual(stat_test_a %>% filter(group1 == "Control" & group2 == "FTLD-TDP"), 
                       label = "p.signif",y.position = 0.98) +
    stat_pvalue_manual(
                       stat_test_a %>% filter(group1 == "ALS-TDP" & group2 == "Control"), 
                       label = "p.signif",y.position = 0.88) +
    ylab("UNC13A IRratio") +
  xlab(element_blank())+
  scale_fill_manual(values = force_colors) +
  ggpubr::theme_pubr() + 
  theme(legend.position = 'none') 

clean_data_table %>% 
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    unique() %>% 
    mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
    ggplot(aes(x = disease_group2,
               y = IRratioUNC13B)) +
    geom_boxplot(aes(fill = disease_group2),outlier.shape = NA) + 
   geom_jitter(aes(fill = disease_group2),
               pch = 21,height = 0,
               width = 0.2,size = 3,alpha = 0.8) +    facet_wrap(~tissue_clean,scales = 'free_x') +
    stat_pvalue_manual(stat_test_b %>% filter(group1 == "Control" & group2 == "FTLD-TDP"), 
                       label = "p.signif",y.position = 0.98) +
    stat_pvalue_manual(
        stat_test_b %>% filter(group1 == "ALS-TDP" & group2 == "Control"), 
        label = "p.signif",y.position = 0.88) +
    ylab("UNC13B IRratio") +
  xlab(element_blank()) +
    scale_fill_manual(values = force_colors) +
    ggpubr::theme_pubr() + 
    theme(legend.position = 'none')

Correlation STMN2 Cryptic PSI and UNC13A TPM

only in ALS/FTLD-TDP

clean_data_table %>%
    filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = UNC13A_TPM)) +
  geom_point() +
  stat_cor(size = 8) +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  xlab("STMN2 Cryptic PSI") +
  ylab("UNC13A TPM") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  ylim(0,50)
`geom_smooth()` using formula 'y ~ x'

Correlation STMN2 Cryptic PSI and UNC13A TPM - tissue separations

clean_data_table %>%
    filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = UNC13A_TPM)) +
  geom_point() +
  stat_cor(size = 8) +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  xlab("STMN2 Cryptic PSI") +
  ylab("UNC13A TPM") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  facet_wrap(~tissue_clean,scales = "free_y")
`geom_smooth()` using formula 'y ~ x'

Correlation between UNC13A CE PSI and UNC13A TPM

clean_data_table %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(UNC13A_annotated_leaf > 15) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(unc13a_cryptic_leaf_psi > 0) %>%
  ggplot(aes(x = unc13a_cryptic_leaf_psi, y = UNC13B_TPM)) +
  geom_point() +
  stat_cor(label.y = 20,size = 8,method = 's',cor.coef.name = 'rho') +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  ylab("UNC13A TPM") +
  xlab("UNC13A CE PSI") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 22)) +
  scale_x_continuous(labels = scales::percent)
`geom_smooth()` using formula 'y ~ x'

Correlation between UNC13A CE PSI and UNC13A TPM - tissue separated

clean_data_table %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(UNC13A_annotated_leaf > 15) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  ggplot(aes(x = unc13a_cryptic_leaf_psi, y = UNC13A_TPM, color = disease_group2)) +
  geom_point() +
  stat_cor(size = 8) +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  ylab("UNC13A TPM") +
  xlab("UNC13A CE PSI") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  facet_wrap(~tissue_clean,scales = 'free') +
  scale_x_continuous(labels = scales::percent)
`geom_smooth()` using formula 'y ~ x'

Relationship between STMN2 and UNC13A CE PSI

Scatter plot showing the correlation in non-log space UNC13A & STMN2 CE

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  # filter(UNC13A_3prime_leaf + UNC13A_5prime_1_leaf >= 2) %>%
  # filter(STMN2_cryptic_leaf >= 2) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(unc13a_cryptic_leaf_psi > 0) %>%
  ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = unc13a_cryptic_leaf_psi)) +
  geom_point(pch = 21, size = 3) +
  stat_cor(size = 8,method = "spearman",cor.coef.name = "rho") +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  xlab("STMN2 Cryptic PSI") +
  ylab("UNC13A CE PSI") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1L)) 
`geom_smooth()` using formula 'y ~ x'

## Scatter plot showing the correlation in non-log space RAP1GAP1 & STMN2 CE

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  # filter(UNC13A_3prime_leaf + UNC13A_5prime_1_leaf >= 2) %>%
  # filter(STMN2_cryptic_leaf >= 2) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(unc13a_cryptic_leaf_psi > 0) %>%
  ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = unc13a_cryptic_leaf_psi)) +
  geom_point() +
  stat_cor(size = 8,method = "spearman",cor.coef.name = "rho") +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  xlab("STMN2 Cryptic PSI") +
  ylab("UNC13A CE PSI") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1L))
`geom_smooth()` using formula 'y ~ x'

Relationship between STMN2 and UNC13B fsE PSI

Scatter plot showing the correlation in non-log space in cortex UNC13B fsE & STMN2 CE

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  mutate(junction_reads_unc13b = UNC13B_nmd_junction_5prime + UNC13B_annotated + UNC13B_nmd_junction_3prime) %>% 
  filter(junction_reads_unc13b >= 30) %>%
  # filter(UNC13A_3prime_leaf + UNC13A_5prime_1_leaf >= 2) %>%
  # filter(STMN2_cryptic_leaf >= 2) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(unc13a_cryptic_leaf_psi > 0) %>%
  ggplot(aes(x = unc13a_cryptic_leaf_psi, y = unc13b_nmd_psi)) +
  geom_point() +
  stat_cor(size = 8,method = "spearman",cor.coef.name = "rho") +
  geom_smooth(method = lm, se = FALSE,color = "black") +
  xlab("UNC13A CE PSI") +
  ylab("UNC13B fsE PSI") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 18)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1L))
`geom_smooth()` using formula 'y ~ x'

Side by side comparison UNC13A & STMN2 CE PSI

clean_data_table %>%
    mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
    filter(tissue_clean %in%
               c("Frontal Cortex",
                 "Lumbar Spinal Cord",
                 "Cervical Spinal Cord",
                 "Motor Cortex",
                 "Temporal Cortex",
                 "Cerebellum") )%>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(disease_tissue == T) %>%
  filter(UNC13A_3prime_leaf + UNC13A_5prime_1_leaf >= 1) %>%
  filter(STMN2_cryptic_leaf >= 1) %>%
  dplyr::select(sample,disease_group2,tissue_clean,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  melt() %>%
    mutate(tissue_clean = fct_relevel(tissue_clean,"Motor Cortex", "Cervical Spinal Cord",
                                    "Frontal Cortex",
                                    "Lumbar Spinal Cord",
                                    "Temporal Cortex")) %>%
  mutate(variable = ifelse(grepl("stmn_2",variable), "STMN2", "UNC13A")) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(aes(color = variable),outlier.shape = NA) +
  geom_jitter(aes(fill = variable),pch = 21,height = 0,width = 0.2,size = 3) +
  scale_fill_manual(values = c("#008574","#cd5582")) +
  scale_color_manual(values = c("#008574","#cd5582")) +
  facet_wrap(~tissue_clean,nrow = 3, scales = 'free') +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24))  +
  xlab("") +
  ylab("CE PSI")  +
  theme(legend.position = 'none') + 
  stat_compare_means(label = 'p.signif')

Side by side comparison UNC13A & STMN2 CE PSI v2

clean_data_table %>%
  filter(UNC13A_annotated_leaf > 10) %>%
  filter(STMN2_annotated_leaf > 10) %>%
  # filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  # filter(unc13a_cryptic_leaf_psi > 0) %>%
  # filter(disease_tissue == T) %>%
    mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  filter(tissue_clean %in% c("Motor Cortex", "Cervical Spinal Cord", "Frontal Cortex", "Lumbar Spinal Cord", "Temporal Cortex")) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,disease_group2,tissue_clean,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_rat = log2((unc13a_cryptic_leaf_psi + 0.1) / (stmn_2_cryptic_psi_leaf + 0.1))) %>%
  group_by(tissue_clean, disease_group2) %>%
  mutate(average_log2Fold = mean(log2_rat)) %>%
  ungroup() %>%
  dplyr::select(-log2_rat) %>%
  pivot_longer(cols = c("stmn_2_cryptic_psi_leaf","unc13a_cryptic_leaf_psi")) %>%
  mutate(tissue_clean = fct_reorder(tissue_clean,average_log2Fold)) %>%
  mutate(variable = ifelse(grepl("stmn_2",name), "STMN2", "UNC13A")) %>%
  ggplot(aes(x = tissue_clean, y = value,fill = name)) +
  geom_boxplot() +
  geom_point(pch = 21,height = 0,width = 0.2,position = position_jitterdodge()) +
  scale_fill_manual(values = c("#004D40","#882255")) +
  facet_wrap(~tissue_clean+disease_group2,nrow = 1,scales = 'free')  +
  theme(text = element_text(size = 24))  +
  xlab("") +
  ylab("PSI")  +
  theme(legend.position = 'none')

Side by side comparison UNC13A & STMN2 CE PSI - v3

clean_data_table %>%
  # filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  # filter(unc13a_cryptic_leaf_psi > 0) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,disease_group2,tissue_clean,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  melt() %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
    mutate(tissue_clean = fct_relevel(tissue_clean,"Motor Cortex", "Cervical Spinal Cord",
                                    "Frontal Cortex",
                                    "Lumbar Spinal Cord",
                                    "Temporal Cortex")) %>%
  mutate(variable = ifelse(grepl("stmn_2",variable), "STMN2", "UNC13A")) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(aes(color = variable)) +
  geom_jitter(aes(fill = variable),pch = 21,height = 0,width = 0.2,size = 3) +
  scale_fill_manual(values = c("#004D40","#882255")) +
  scale_color_manual(values = c("#004D40","#882255")) +
  facet_wrap(~tissue_clean+disease_group2,nrow = 3,scales = 'free') +
    scale_y_continuous(labels = scales::percent)  +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24))  +
  xlab("") +
  ylab("PSI")  +
  theme(legend.position = 'none')

Log2FC UNC13A & STMN2 CE PSI

clean_data_table %>%
  filter(UNC13A_annotated_leaf > 10) %>%
  filter(STMN2_annotated_leaf > 10) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(unc13a_cryptic_leaf_psi > 0) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,disease_group2,tissue_clean,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_unc = log2((unc13a_cryptic_leaf_psi)/(stmn_2_cryptic_psi_leaf)))%>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%

  # mutate(variable = ifelse(grepl("stmn_2",variable), "STMN2", "UNC13A")) %>%
  ggplot(aes(x = tissue_clean,y = log2_unc, fill = tissue_clean)) +
  geom_boxplot(position = 'dodge') +
  # geom_jitter(height = 0,width = 0.2,pch = 21) +
  facet_wrap(~disease_group2,scales = "free_x") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 24))  +
  xlab("") +
  ylab("PSI")  +
  theme(legend.position =  'top') +
  theme(legend.title=element_blank()) +
  stat_compare_means(comparisons = list(c("STMN2","UNC13A")))

Effect of UNC13A genotype

UNC13A psi by genotype - rs12973192 - RE

rs12973192_comparisons = clean_data_table %>%
  filter(junction_reads_unc13a >=30) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )"))

comp_for_gg = rs12973192_comparisons %>% pull(detection_name) %>%
  unique() %>%
  combn(.,2,simplify = F)

ce_psi = rs12973192_comparisons %>%
  ggplot(aes(y = unc13a_cryptic_leaf_psi, x = detection_name, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  ylab("UNC13A PSI") +
  xlab(element_blank()) +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means() +
  theme(legend.position = "none") +
  stat_compare_means(comparisons = comp_for_gg,label = "p.signif") +
  scale_y_continuous(trans='log10',labels = scales::percent_format(accuracy = 1L))  +
  ggtitle("CE SNP (rs12973192)")

UNC13A psi by genotype - rs12608932 - RI

rs12608932_comparisons = clean_data_table %>%
  filter(junction_reads_unc13a >=30) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12608932,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  group_by(rs12608932) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12608932} \n ( {n_sample} )"))

comp_for_gg = rs12608932_comparisons %>% pull(detection_name) %>%
  unique() %>%
  combn(.,2,simplify = F)

in_psi = rs12608932_comparisons %>%
  ggplot(aes(y = unc13a_cryptic_leaf_psi, x = detection_name, fill = rs12608932)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  ylab(element_blank()) +
  xlab(element_blank()) +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 24)) +
  stat_compare_means() +
  stat_compare_means(comparisons = comp_for_gg,label = "p.signif") +
    scale_y_continuous(trans='log10',labels = scales::percent_format(accuracy = 1L))  +
  ggtitle("intronic SNP (rs12608932)")
ggpubr::ggarrange(ce_psi,in_psi)

UNC13A psi by genotype - rs12973192

genotype_comparisons = list(c("C/C", "C/G"), c("C/C", "G/G"), c("C/G", "G/G"))

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  ggplot(aes(y = unc13a_cryptic_leaf_psi, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2,scales = 'free') +
  ylab("UNC13A PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

UNC13A psi by genotype - tissue split

genotype_comparisons = list(c("C/C", "C/G"), c("C/C", "G/G"), c("C/G", "G/G"))

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  ggplot(aes(y = unc13a_cryptic_leaf_psi, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2 + tissue_clean,scales = 'free') +
  ylab("UNC13A PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

UNC13A psi in detected tissues by genotype - tissue split

clean_data_table %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  ggplot(aes(y = unc13a_cryptic_leaf_psi, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2 + tissue_clean,scales = 'free') +
  ylab("UNC13A PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

STMN2 psi by genotype

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  ggplot(aes(y = stmn_2_cryptic_psi_leaf, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2,scales = 'free') +
  ylab("STMN2 PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

STMN2 psi by genotype - tissue split

clean_data_table %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  ggplot(aes(y = stmn_2_cryptic_psi_leaf, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2 + tissue_clean,scales = 'free') +
  ylab("STMN2 PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

STMN2 psi in detected tissues by genotype - tissue split

clean_data_table %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(tissue_clean = gsub("_"," ",tissue_clean)) %>%
  ggplot(aes(y = stmn_2_cryptic_psi_leaf, x = rs12973192, fill = rs12973192)) +
  geom_boxplot() +
  geom_jitter( height = 0) +
  facet_wrap(~disease_group2 + tissue_clean,scales = 'free') +
  ylab("STMN2 PSI") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  stat_compare_means(comparisons = genotype_comparisons,label = "p.signif")

Correlation UNC13A PSI and STMN2 PSI with genotype effect - concordants samples

clean_data_table %>%
  filter(!(rs12973192 == "C/G" & rs12608932 == "C/C")) %>%
  mutate(snp_status = case_when(rs12973192 == "C/C" ~ "healthy/healthy",
                                rs12973192 == "C/G" ~ "healthy/risk",
                                rs12973192 == "G/G" ~ "risk/risk")) %>% 
  filter(disease_tissue == T) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
    mutate(rs12973192 = fct_relevel(rs12973192,
                              "C/C", "C/G", "G/G")) %>%
  filter(!grepl("Cord",tissue_clean)) %>%
  mutate(log10_stmn2 = log10(stmn_2_cryptic_psi_leaf)) %>%
    mutate(log10_unc13a = log10(unc13a_cryptic_leaf_psi)) %>%
    ggpubr::ggscatter(.,
                      x = "log10_stmn2",
                      y = "log10_unc13a",
                      fill = 'snp_status',
                      color = 'black', 
                      size = 3,shape = 21,
                      ) +
    ylab("UNC13A CE PSI ") +
    xlab("STMN2 Cryptic PSI ") +
  geom_smooth(method = 'lm',aes(color = snp_status),se = F) + 
  stat_cor(aes( x = stmn_2_cryptic_psi_leaf,
                      y = unc13a_cryptic_leaf_psi,
                      color = snp_status),
           method = 'spearman',
           cor.coef.name = 'rho',
           show.legend = FALSE,
           size = 14) +
    scale_color_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
      scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +

  theme(legend.title = element_blank()) +
  theme(text = element_text(size = 24),legend.text = element_text(size = 32)) +
    ylab(expression(paste("Lo", g[10], "UNC13A CE PSI"))) +
      xlab(expression(paste("Lo", g[10], "STMN2 CE PSI"))) 
`geom_smooth()` using formula 'y ~ x'

Log2 Fold UNC / STMN2 CE in TDP-path with both detected - rs12973192 - cortex only

detected_table_for_graph = clean_data_table %>%
  filter(!(rs12973192 == "C/G" & rs12608932 == "C/C")) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,rs12608932,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_unc = log2((unc13a_cryptic_leaf_psi)/(stmn_2_cryptic_psi_leaf))) %>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  mutate(no_cong = ifelse(rs12608932 == "C/C" & rs12973192 == "C/G",8,3)) %>% as.data.table()

#get out that person who is discongruous

comparisons = combn(unique(detected_table_for_graph$detection_name),2,simplify = F)

detected_table_for_graph %>%
  ggplot(aes(y = log2_unc, x = detection_name,fill = detection_name)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means() +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  ylab("Ratio UNC13A CE PSI / STMN2 PSI") +
  ylab(expression(paste("Lo", g[2], " fold UNC13A CE PSI / STMN2 CE PSI "))) +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5)

Log2 Fold UNC / STMN2 CE in TDP-path with both detected - rs12973192 - cortex and cord

detected_table_for_graph = clean_data_table %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,rs12608932,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_unc = log2((unc13a_cryptic_leaf_psi)/(stmn_2_cryptic_psi_leaf))) %>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  mutate(no_cong = ifelse(rs12608932 == "C/C" & rs12973192 == "C/G",8,3))



comparisons = combn(unique(detected_table_for_graph$detection_name),2,simplify = F)

detected_table_for_graph %>%
  ggplot(aes(y = log2_unc, x = detection_name,fill = detection_name)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means( label = "p.format") +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  ylab("Ratio UNC13A CE PSI / STMN2 PSI") +
  ylab(expression(paste("Lo", g[2], " fold UNC13A CE PSI / STMN2 CE PSI "))) +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5)

pretty_effect_plot = function(model, p_value_cutoff = 0.05, title = "Model 1"){
    p_value = Tmisc::lmp(model)
    r_squared = summary(model)$adj.r.squared
    pretty_plot = broom::tidy(model) %>% 
        filter(p.value < p_value_cutoff) %>%
        mutate(term = fct_reorder(term, -abs(estimate))) %>%
        ggplot(aes(x = term, y = estimate,fill = estimate > 0 )) +
        geom_col() +
        coord_flip() +
        ggtitle(title,subtitle = glue::glue("Model p-value: {p_value} \n Adj. R-squared: {r_squared}"))
}

detect2 = clean_data_table %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(disease_tissue == T) %>% 
  mutate(call = fct_relevel(rs12973192,
                              "C/C", "C/G", "G/G")) %>% 
    mutate(number_g_alleles = as.numeric(call)) %>% 
  mutate(age_at_death = as.numeric(age_at_death))

m1 = lm(unc13a_cryptic_leaf_psi ~ stmn_2_cryptic_psi_leaf * 
          number_g_alleles + 
          tissue_clean + 
          age_at_death + 
          sex, 
              detect2)

k = pretty_effect_plot(m1,p_value_cutoff = 0.05)
print(k)

Log2 Fold UNC / STMN2 CE in TDP-path with both detected - rs12608932

detected_table_for_graph_rs12608932 = clean_data_table %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12608932,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_unc = log2((unc13a_cryptic_leaf_psi)/(stmn_2_cryptic_psi_leaf))) %>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%
  group_by(rs12608932) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12608932} \n ( {n_sample} )"))



comparisons = combn(unique(detected_table_for_graph_rs12608932$detection_name),2,simplify = F)

detected_table_for_graph_rs12608932 %>%
  ggplot(aes(y = log2_unc, x = detection_name,fill = detection_name)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means( label = "p.format") +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  ylab("Ratio UNC13A CE PSI / STMN2 PSI") +
  ylab(expression(paste("Lo", g[2], " fold UNC13A CE PSI / STMN2 CE PSI "))) +
  xlab("UNC13A rs12608932 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5)

Log2 Fold UNC / STMN2 CE in TDP-path with both detected - STR - cortex and cord

str_genotype = janitor::clean_names(fread(file.path(here::here(),"data/rs56041637.genotypes.377samples.txt"))) %>% 
  dplyr::rename(participant_id = number_sample_id)

detected_table_for_graph = clean_data_table %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(unc13a_cryptic_leaf_psi > 0 ) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter(junction_reads_unc13a >= 30) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,participant_id,tissue_clean,disease_group2,rs12973192,rs12608932,stmn_2_cryptic_psi_leaf,unc13a_cryptic_leaf_psi) %>%
  mutate(log2_unc = log2((unc13a_cryptic_leaf_psi)/(stmn_2_cryptic_psi_leaf))) %>% 
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>% 
  left_join(str_genotype) %>% 
    filter(!is.na(genotype_ci)) %>% unique()
Joining, by = "participant_id"
comparisons = combn(unique(detected_table_for_graph$genotype),2,simplify = F)

detected_table_for_graph %>%
  ggplot(aes(y = log2_unc, x = genotype,fill = genotype)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means( label = "p.format") +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  ylab("Ratio UNC13A CE PSI / STMN2 PSI") +
  ylab(expression(paste("Lo", g[2], " fold UNC13A CE PSI / STMN2 CE PSI "))) +
  xlab("UNC13A STR Genotype") +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5)

STR by SNP genotype

clean_data_table %>% 
  mutate(presumed_unique_id = gsub("-b38","",participant_id)) %>% 
  select(participant_id,presumed_unique_id,rs12973192,rs12608932) %>% 
  unique() %>% 
  left_join(str_genotype) %>% 
  unique() %>% 
  mutate(genotype = fct_relevel(genotype,"6/6", "6/7", "6/8", "6/9",
                                "6/10","6/12","7/8","8/8","8/10","9/10")) %>% 
  ggplot(aes(x = genotype,fill = rs12973192)) + 
  geom_bar() + 
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ylab("N individuals") +
  xlab("STR genotype")
Joining, by = "participant_id"

Detection rate by genotype

overall_fisher = clean_data_table %>%
    filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  mutate(unc13a_detected = unc13a_cryptic_leaf_psi > 0) %>%
  dplyr::select(participant_id,unc13a_detected,rs12973192) %>%
  unique() %>%
  group_by(rs12973192) %>%
  mutate(n_sample = n_distinct(participant_id)) %>%
  mutate(n_sample_detected = sum(unc13a_detected)) %>%
  dplyr::select(rs12973192,n_sample,n_sample_detected) %>%
  unique() %>%
  mutate(n_non_detected = n_sample - n_sample_detected) %>%
  dplyr::select(-n_sample)


over_p = overall_fisher %>%
  column_to_rownames('rs12973192') %>%
  chisq.test() %>%
  broom::tidy() %>%
  .$p.value

overall_fisher %>%
  mutate(n_sample = n_non_detected + n_sample_detected) %>%
  mutate(detection_rate = n_sample_detected / n_sample) %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  ggplot(aes(x = detection_name, y = detection_rate, fill = detection_name)) +
  geom_col(show.legend = F) +
  ggpubr::theme_pubr() +
  scale_y_continuous(labels = scales::percent) +
  ylab("% of TDP-43 Proteionopathy Patients \n UNC13A Cryptic Detected") +
  theme(text = element_text(size = 18)) +
  xlab("N individuals") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggtitle(label = "ALS/FTD-TDP patients",subtitle =  glue::glue("Chi-squared Test: p-value {round(over_p,2)}"))

Although this difference is not significant, with the Fisher’s exact giving a p-value of 0.27.

Variant allele expression in TDP-43 KD experiments

Across our KD’s, we observed a clear allelic bias in the SK-DZ-N cells

kd_vafs = fread(file.path(here::here(),"data","all_kds_unc13.snp.out.tsv"))

kd_vafs %>%
    left_join(rel_rna_cryptic_amount) %>%
    filter(!is.na(condition)) %>%
    mutate(VAF = alt_reads / total) %>%
    filter(condition != "control" ) %>%
    filter(alt_reads > 5) %>%
    ggplot(aes(x = stmn_2_cryptic_psi,y = cryptic_psi, fill = -(VAF - 0.5),size = total)) +
    geom_point(pch = 21)  +
    coord_fixed() +
  ylim(0,0.8) +
  xlim(0,0.8) +
  geom_abline() +
    scale_fill_gradient2()
Joining, by = "sample"

Liu Facs Neurons

Also look at the coverage in the Liu nuclear facs sorted neurons

liu_vafs = fread(file.path(here::here(),"data","liu_facs_neurons_unc13.snp.out.tsv"))
liu_stmn2 = fread(file.path(here::here(),"data","liu_stmn2_and_unc13aaggregated.clean.annotated.bed"))
liu_stmn2[,sample := gsub(".SJ.out","",V4)]
liu_stmn2 = liu_stmn2 %>%
  unique() %>%
    pivot_wider(id_cols = 'sample',
                names_from = "V7",
                values_from = 'V5',
                values_fill = 0)

liu_stmn2 <- liu_stmn2 %>%
   mutate(stmn2_psi = STMN2_cryptic / (STMN2_cryptic + STMN2_annotated)) %>%
   mutate(unc13a_psi = (UNC13A_3prime + UNC13A_5prime + UNC13A_5prime_2)/ (UNC13A_annotated + UNC13A_3prime + UNC13A_5prime + UNC13A_5prime_2))
liu_vafs %>%
    mutate(VAF = alt_reads / total) %>%
    data.table::melt(id.vars = c("sample",'VAF','total')) %>%
    mutate(allele = ifelse(variable == "ref_reads", "C","G")) %>%
    mutate(sample_name = sub("_TDP_.*", "", sample)) %>%
    mutate(sample_name = gsub("_unsorted", "", sample_name)) %>%
    mutate(condition = sub(".*_", "", sample)) %>%
    ggplot(aes(x = condition,y = value, fill = allele)) +
    geom_col(pch = 21, size = 4) +
    facet_wrap(~sample_name,scales = "free")

liu_stmn2 %>%
  dplyr::select(stmn2_psi,sample,unc13a_psi) %>%
  data.table::melt() %>%
   mutate(sample_name = sub("_TDP_.*", "", sample)) %>%
    filter(!grepl("unsorted",sample_name)) %>%
    mutate(condition = sub(".*_", "", sample)) %>%
    mutate(condition = ifelse(grepl("negative",condition), "TDP-43 \nNegative", "TDP-43 \nPositive")) %>%
    mutate(variable = ifelse(grepl("stmn2",variable), "STMN2", "UNC13A")) %>%
    ggplot(aes(x = condition,y = value, fill = variable)) +
    geom_col(size = 4,position = 'dodge') +
    facet_wrap(~sample_name,scales = "free_x") +
    scale_fill_manual(values = c("#004D40","#882255")) +
    theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(text = element_text(size = 24))  +
      xlab("") +
    ylab("PSI")  +
  theme(legend.position =  'top') +
  theme(legend.title=element_blank()) +
  stat_compare_means(comparisons = list(c("STMN2","UNC13A")))
Using sample as id variables

liu_stmn2 %>%
  dplyr::select(stmn2_psi,sample,unc13a_psi) %>%
  data.table::melt() %>%
   mutate(sample_name = sub("_TDP_.*", "", sample)) %>%
    filter(!grepl("unsorted",sample_name)) %>%
    mutate(condition = sub(".*_", "", sample)) %>%
    mutate(condition = ifelse(grepl("negative",condition), "TDP-43 \nNegative", "TDP-43 \nPositive")) %>%
    mutate(variable = ifelse(grepl("stmn2",variable), "STMN2", "UNC13A")) %>%
  ggbarplot(,
            x = "condition",
            add = c("mean_se",'jitter'),
            y = "value",
            fill = 'variable',
            color = 'variable',
            position = position_dodge(0.8),dot.size = 6) +
  ggpubr::theme_pubr() +
  scale_color_manual(values = c("#000000","#000000")) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 24))  +
  scale_y_continuous(labels = scales::percent) +
  xlab("") +
  ylab("PSI")  +
  theme(legend.position =  'top') +
  theme(legend.title=element_blank()) +
  stat_compare_means(comparisons = list(c("STMN2","UNC13A")))
Using sample as id variables

liu_stmn2 %>%
   mutate(stmn2_psi = STMN2_cryptic / (STMN2_cryptic + STMN2_annotated)) %>%
   mutate(unc13a_psi = (UNC13A_3prime + UNC13A_5prime)/ (UNC13A_annotated + UNC13A_3prime + UNC13A_5prime)) %>%
   left_join(liu_vafs) %>%
    mutate(alt_vaf = (alt_reads)/total) %>%
    mutate(condition = ifelse(grepl("negative",sample),"negative", "positive")) %>%
             filter(!grepl("unsorted",sample)) %>%

    ggplot(aes(y = unc13a_psi,x = stmn2_psi, fill = alt_vaf)) +
    geom_point(pch = 21,size = 5) +
    ggtitle("STMN2 PSI and the VAF of the G allele \n in the Liu Nuclei ") +
  facet_grid(~condition) +
  scale_fill_viridis_c(option = "plasma") +
  coord_fixed() +
  geom_abline()
Joining, by = "sample"

Variant allele expression in patients

nycg_vafs = fread(file.path(here::here(),"data","NYGC_all_samples_UNC13A_snp_coverage.tsv"),fill = T)
nycg_vafs %>%
    left_join(clean_data_table) %>%
    filter(rs12973192 == "C/G") %>%
    mutate(fraction_risk = alt_reads / total) %>%
    filter(!is.na(fraction_risk)) %>%
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
    filter(unc13a_cryptic_leaf_psi > 0 ) %>%
    ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = unc13a_cryptic_leaf_psi)) +
    geom_point(size = 4, pch = 21, aes(fill = fraction_risk - 0.5)) +
    xlab("STMN2 Cryptic PSI") +
    ylab("UNC13A CE PSI") +
    geom_abline() +
    scale_fill_gradient2()
Joining, by = "sample"

nycg_vafs %>%
    left_join(clean_data_table) %>%
    filter(rs12973192 == "C/G") %>%
    mutate(fraction_risk = alt_reads / total) %>%
    filter(!is.na(fraction_risk)) %>%
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
    filter(unc13a_cryptic_leaf_psi > 0 ) %>%
    mutate(stmn2_unc_fract = ((unc13a_cryptic_leaf_psi * 100) / (stmn_2_cryptic_psi_leaf * 100))) %>%
    ggplot(aes(x = stmn2_unc_fract, y = fraction_risk)) +
    geom_point(size = 4, pch = 21, aes(fill = fraction_risk - 0.5)) +
    xlab("STMN2 Cryptic PSI") +
    ylab("UNC13A CE PSI") +
    scale_fill_gradient2()
Joining, by = "sample"

Ribosome profiling

Ribosome profiling quality control

TODO: convert all libraries to p_load.

Perform quality control to check that reads are periodic.

read_length_filter <- 29  # length of reads of interest
gencode_v29_info <- fread(paste0(here(), "/data/longest_proteincoding_transcript_hs_details.txt"))
ribo_df <- fread(paste0(here(), "/data/riboseq/sh_tdpb.Aligned.toTranscriptome.out.bam.bed.gz")) 
names(ribo_df)  = c("transcript_id", "start0", "end", "strand")
ribo_df = ribo_df %>% 
    inner_join(gencode_v29_info, by = "transcript_id") %>%
    dplyr::filter(end-start0 == read_length_filter, strand == "+") %>%
    mutate(distance_from_start = start0-cds_start) %>%
    group_by(distance_from_start) %>%
    mutate(n_this_dist = n()) %>%
    dplyr::select(distance_from_start, n_this_dist) %>%
    unique()
ggplot(ribo_df, aes(x=distance_from_start, y=n_this_dist)) +
    geom_line() +
    geom_point() +
    xlim(-50, 50) +
    ylab("Counts") +
    xlab("Distance of 5' end from annotated start codon") +
    ggtitle(paste0("Periodicity of ", read_length_filter, "nt reads from TDP-43 KD replicate B"))

Ribosome profiling volcano plot

Use DESeq to detect differential ribosome footprints and create volcano plots

counts_df <- read_csv(paste0(here(), "/data/riboseq/cds_feature_counts.csv"))
counts <- as.matrix(counts_df %>% column_to_rownames("gene_id"))
ipsc_splicing <- read_csv(paste0(here(), "/data/ipsc_splicing_results.csv"))
ipsc_de = read_csv(paste0(here(), "/data/ipsc_differential_expression.csv"))

col_data <- data.frame(sample = colnames(counts)) %>%
  mutate(condition = ifelse(str_detect(sample, "tdp"), "tdp_ko", "control"))

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = col_data,
                              design= ~condition)
dds <- DESeq(dds)
results_name <- resultsNames(dds)[2]
res <- results(dds, name=results_name)

res_df <- data.frame(res) %>%
  rownames_to_column("gene_id") %>%
  left_join(gencode_v29_info, by = "gene_id")

cryptic_df <- ipsc_splicing %>%
  dplyr::select(gene_name, cryptic_junction) %>%
  unique() %>%
  filter(cryptic_junction)

res_df2 <- res_df %>%
  mutate(cryptic_junction = gene_name %in% cryptic_df$gene_name) %>%
  mutate(label_junction = case_when(gene_name %in% c("UNC13A","AGRN",
                                                     "UNC13B","PFKP","SETD5",
                                                     "ATG4B","STMN2","TARDBP") ~ gene_name)) %>%
  mutate(colour = ifelse(cryptic_junction, "#fe6101",  "#648FFF")) %>%
  arrange(colour)
p1 <- ggplot(res_df2, aes(x = log2FoldChange, y = -log10(pvalue), label = label_junction)) +
  geom_point(data = res_df2 %>% filter(!cryptic_junction),
             aes(x = log2FoldChange, y= -log10(pvalue)), color = "#648FFF") +
  geom_point(data = res_df2 %>% filter(cryptic_junction),
             aes(x = log2FoldChange, y= -log10(pvalue)), color = "#fe6101") +
  ggpubr::theme_pubr() +
  ylab("-Log10(p-value)") +
  xlab("Log2 fold change") +
  geom_text_repel(box.padding = 1.2, max.overlaps = Inf)

p2 <- p1 + ylim(0,10)
p1

p2

Targeted RNA-seq

Read in data from processed bam files

TODO: add scripts which deduplicate UMIs for each.

specific_rt <- read_csv(paste0(here(), "/data/targeted_RNA_seq/specific_RT_primer/for_plot.csv")) %>%
    select(risk, sample_name, n)

random_hexamer <- read_csv(paste0(here(), "/data/targeted_RNA_seq/random_hexamer/values_for_plot.csv")) %>%
    select(-sample_name) %>%
    select(sample_name = actual_sample_name, risk, n)

tg_rnaseq_df <- bind_rows(specific_rt, random_hexamer) %>%
  group_by(sample_name) %>%
  mutate(n_healthy = ifelse(risk == "Non-Risk", n, 0)) %>%
  mutate(combined_healthy = sum(n_healthy)) %>%
  select(-n_healthy) %>%
  mutate(combined = sum(n)) %>%
  group_by(sample_name, risk) %>%
  mutate(combined_n = sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  unique() %>%
  group_by(sample_name) %>%
  mutate(p_value = pbinom(combined_healthy, sum(combined_n), 0.5)) %>%  # single-tailed bionomial test
  ungroup() %>%
  mutate(sample_name_ordered = fct_reorder(sample_name, dplyr::desc(combined_n)))

ggplot(tg_rnaseq_df %>%
         filter(combined_n > 0), aes(x = sample_name_ordered, y = combined_n, fill = risk)) +
  geom_bar(stat="identity", position="dodge") +
  geom_text(aes(x=sample_name, y=-2.5, label=signif(p_value,2)), size = 3.5) +
  xlab("") +
  ylab("Unique cDNAs") +
  ggeasy::easy_add_legend_title("Allele") +
  theme_minimal() +
  ggpubr::theme_pubr() +
  ggeasy::easy_rotate_x_labels()

iCLIP of minigenes

Read in the processed data (following alignment with Bowtie2, and filtering for high-confidence alignments), and generate a plot of the overall distribution of crosslinks, and the rolling average difference between the 2x Risk and 2x Healthy.

Next, filter for peaks in the iCLIP signal, and see how they change in 2x Risk versus 2x Healthy. Highlight those which are close to the SNPs.

TODO: add script which does this filtering TODO: add fasta files for the minigenes

exon_snp <- 556
intron_snp <- 1106
intron_start <- 141
intron_end <- 1429
acceptor1 <- 394
acceptor2 <- 444
donor <- 572
mg_l <- 1624
cpoor_start <- 490
cpoor_end <- 1325
risk_colour <- "#00BFC4"
healthy_colour <- "#F8766D"
confidence_interval <- 0.75  # fracion, i.e. 0.75 = 75%.
peak_threshold <- 5 # how many times above the mean to be a "peak"
region_length <- 50

filtered_df <- read_csv(paste0(here(), "/data/iCLIP/bowtie2_aligned_2_filtered_df.csv"))

rolling_window <- 20
peak_t <- peak_threshold*1000/mg_l  # by definition...

df2 <- filtered_df %>%
  ungroup() %>%
  group_by(condition, pos) %>%
  mutate(mean_counts = sum(normalised_counts)/2) %>%
  select(pos, condition, mean_counts) %>%
  unique() %>%
  pivot_wider(names_from = "condition", values_from ="mean_counts")

df2[is.na(df2)] <- 0

df2 <- df2 %>%
  mutate(diff = R-H) %>%
  mutate(mean_4 = (H+R)/2) %>%
  arrange(pos)

top <- ggplot(df2, aes(x=pos, y=diff)) +
  geom_area(aes(x=pos, y = mean_4), colour="grey50", fill = "grey50", alpha=0.2) +
  geom_vline(xintercept = exon_snp, linetype="dashed") +
  geom_vline(xintercept = intron_snp, linetype="dashed") +
  xlim(1,1600) +
  theme_minimal() +
  ylab("XLs per 1000") +
  xlab("") +
  geom_vline(xintercept = cpoor_start) +
  geom_vline(xintercept = cpoor_end)

bottom <- ggplot(df2, aes(x=pos, y=zoo::rollmean(diff, k = rolling_window, na.pad=T))) +
  geom_area(colour = "grey50", fill="grey50", alpha = 1) +
  geom_vline(xintercept =  exon_snp, linetype="dashed") +
  geom_vline(xintercept =  intron_snp, linetype="dashed") +
  xlim(1,1600) +
  ylim(-0.5,0.75) +
  theme_minimal() +
  ylab("Change in XL density") +
  xlab("Distance from start of minigene") +
  geom_vline(xintercept =  intron_start) +
  geom_vline(xintercept =  intron_end) +
  geom_vline(xintercept =  acceptor1) +
  geom_vline(xintercept =  acceptor2) +
  geom_vline(xintercept =  donor)

combined <- top/bottom
combined

find_ratio_bound <- function(e_x, sem_x, e_y, sem_y, quantile_value, randoms = 10000){
  # monte carlo confidence interval estimate for a ratio. Assumes the two variables
  # x and y are normally distributed. Finds confidence interval for (x-y)/y

  # randomly generate possible global means for x and y

  xs = rnorm(randoms, mean = e_x, sd=sem_x)  # risk
  ys = rnorm(randoms, mean = e_y, sd=sem_y)  # healthy

  ratios = (xs-ys)/ys

  return(quantile(ratios, quantile_value))
}

peak_df <- filtered_df %>%
  ungroup() %>%
  group_by(pos) %>%
  mutate(max_at_pos = max(normalised_counts)) %>%  # at least 1 of the 4 samples above the threshold
  ungroup() %>%
  filter(max_at_pos >= peak_t) %>%  # at least 1 of the 4 samples above the threshold
  group_by(condition, pos) %>%
  mutate(peak_mean = mean(normalised_counts),
         peak_sd = sd(normalised_counts)) %>%
  ungroup() %>%
  select(condition, pos, peak_mean, peak_sd) %>%
  unique() %>%
  pivot_wider(names_from = "condition", values_from = c("peak_mean", "peak_sd")) %>%
  mutate(frac_increase = (peak_mean_R-peak_mean_H)/peak_mean_H) %>%
  mutate(peak_rank = rank(frac_increase)) %>%
  mutate(colour = ifelse(abs(pos-exon_snp)<region_length, "#f7e7abff",
                         ifelse(abs(pos-intron_snp)<region_length, "#c8e1c4ff", "grey50")))

peak_df$upper_bound <- mapply(find_ratio_bound,
                        peak_df$peak_mean_R,
                        peak_df$peak_sd_R/sqrt(2), #convert to SEM
                        peak_df$peak_mean_H,
                        peak_df$peak_sd_H/sqrt(2), #convert to SEM
                        0.5*(1+confidence_interval))

peak_df$lower_bound <- mapply(find_ratio_bound,
                        peak_df$peak_mean_R,
                        peak_df$peak_sd_R/sqrt(2), #convert to SEM
                        peak_df$peak_mean_H,
                        peak_df$peak_sd_H/sqrt(2), #convert to SEM
                        0.5*(1-confidence_interval))

ggplot(peak_df, aes(x = peak_rank, y = 100*frac_increase, fill=colour)) +
  geom_bar(stat="identity",,color = 'black') +
  scale_fill_identity() +
  theme_minimal() +
  ggpubr::theme_pubr()+
  ylab("% change in 2x Risk") +
  xlab("Rank") +
  geom_errorbar(aes(ymin = 100*lower_bound, ymax = 100*upper_bound, x = peak_rank),
                width = 0.3,size = 1.1)

Supplementary iCLIP Figures

bars <- filtered_df %>%
  ungroup() %>%
  group_by(condition, pos) %>%
  mutate(mean_normalised = mean(normalised_counts),
         sd_normalised = sd(normalised_counts)) %>%
  mutate(colour = ifelse(condition == "H", healthy_colour, risk_colour))

bar_top <- ggplot(bars %>% filter(abs(pos-exon_snp) < 35), aes(x = pos, y = mean_normalised,
                                                       fill = colour)) +
  geom_bar(stat="identity", position="dodge") +
  scale_fill_identity() +
  geom_errorbar(aes(ymin = mean_normalised-sd_normalised, ymax=mean_normalised+sd_normalised),
                position="dodge") +
  geom_vline(xintercept = exon_snp, linetype="dashed") +
  ylab("XLs per 1000") +
  ggpubr::theme_pubr() +
  ggtitle("iCLIP signal around CE SNP") +
  xlab("")


bar_bottom <- ggplot(bars %>% filter(abs(pos-intron_snp) < 35), aes(x = pos, y = mean_normalised,
                                                       fill = colour)) +
  geom_bar(stat="identity", position="dodge") +
  scale_fill_identity() +
  geom_errorbar(aes(ymin = mean_normalised-sd_normalised, ymax=mean_normalised+sd_normalised),
                position="dodge") +
  geom_vline(xintercept = intron_snp, linetype="dashed") +
  ggpubr::theme_pubr() +
  ylab("XLs per 1000") +
  ggtitle("iCLIP signal around intronic SNP") +
  xlab("")


bar_combined <- bar_top / bar_bottom
bar_combined

Heptameric analysis

Analyse E-values (higher E value = stronger binding enrichment) for the motifs present at the exon SNP.

# Read in supplementary 4 from Ray et al 2013

data <- read_tsv(paste0(here(), "/data/heptamers/41586_2013_BFnature12311_MOESM334_ESM.txt"))

for(type in c("exon", "intron")){

    if(type == "exon"){
        snp <-     "TAAAAGCATGGAT"
        healthy <- "TAAAAGGATGGAT"
    } else {
        snp <-     "GGATGGGTGGATA"
        healthy <- "GGATGGTTGGATA"
    }

    all_healthy <- c()
    all_risk <- c()

    # Make dataframe containing E scores for each relevant 7mer

    for(i in 1:(str_length(snp)-6)){
        this_snp <- str_sub(snp, i, i+6)
        this_healthy <- str_sub(healthy, i, i+6)

        all_healthy <- c(all_healthy, str_replace_all(this_healthy, "T", "U"))
        all_risk <- c(all_risk, str_replace_all(this_snp, "T", "U"))

        row1 <- data %>% filter(`7mer` == str_replace_all(this_snp, "T", "U")) %>%
            mutate(snp = T, pos = i)

        row2 <- data %>% filter(`7mer` == str_replace_all(this_healthy, "T", "U")) %>%
            mutate(snp = F, pos = i)

        if(i == 1){
            full <- bind_rows(row1, row2)
        } else {
            full <- bind_rows(full, row1, row2)
        }
    }

    # Find average change in E score for the two groups of heptamers

    full2 <- full %>%
        group_by(pos) %>%
        filter(n() == 2) %>%
        ungroup() %>%
        pivot_longer(cols = contains("set")) %>%
        select(-`7mer`) %>%
        pivot_wider(names_from = "snp", values_from = "value") %>%
        mutate(abs_change = `TRUE`-`FALSE`) %>%
        filter(!is.na(`TRUE`) & !is.na(`FALSE`)) %>%
        group_by(name) %>%
        mutate(mean_abs_change = mean(abs_change)) %>%
        select(name, mean_abs_change) %>%
        unique() %>%
        arrange(mean_abs_change) %>%
        mutate(is_tdp43 = ifelse(str_detect(name, "RNCMPT00076"), "red", "grey80")) %>%
        mutate(short_name = str_sub(name, 1, 11)) %>%
        ungroup() %>% group_by(short_name) %>%
        mutate(average_delta = mean(mean_abs_change)) %>%
        select(short_name, is_tdp43, average_delta) %>%
        unique()

    # Plot ranks

    plot <- ggplot(full2, aes(x=rank(average_delta), y = average_delta,
                      fill = is_tdp43)) +
        geom_bar(stat="identity") +
        scale_fill_identity() +
        theme_minimal() +
        ggtitle(paste0("Average change in binding E value for each dataset with SNP for ", type, " SNP")) +
        xlab("") +
        ylab("Average change in E score") +
        ggpubr::theme_pubr() +
        xlab("Rank")

    print(plot)

    full_just_tdp <- full %>%
        group_by(pos) %>%
        filter(n() == 2) %>%
        ungroup() %>%
        select(`7mer`, contains("RNCMPT00076"), snp, pos) %>%
        mutate(mean_E = (RNCMPT00076_e_setA + RNCMPT00076_e_setB)/2) %>%
        pivot_longer(cols = contains("RNCMP")) %>%
        mutate(ypos = ifelse(snp, -0.35, -0.27))

    plot2 <- ggplot(full_just_tdp, aes(x = pos, y = value, colour = snp,
                              label = `7mer`)) +
        geom_point(size = 3) +
        xlab("7mer") +
        ylab("E value (similar to rank?)") +
        ggeasy::easy_add_legend_title("Contains Risk SNP") +
        ggtitle(paste0("Binding E value for TDP43 for all 7x 7mers with or without risk SNP for ", type, " SNP")) +
        geom_label(aes(y = ypos), size = 2.7) +
        xlim(0.5, 7.5) +
        theme_bw() +
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())  +
        ggeasy::easy_remove_x_axis()

    print(plot2)

}

Isothermal Titration Calorimetry

Read in values, plot and perform a t-test

df <- read_csv(paste0(here(),
                      "/data/itc/itc final values.csv")) 

df2 <- df %>%
    group_by(RNA, experiment) %>%
    mutate(sd2 =ifelse(is.na(sd), sd(kd), sd)) %>%
    mutate(mean_kd = mean(kd)) %>%
  ungroup() %>%
  select(experiment, RNA, mean_kd, sd2) %>%
  unique()

ggplot(df2, aes(x = experiment, fill = RNA, y = mean_kd)) +
    geom_bar(stat="identity", position="dodge") +
    geom_errorbar(aes(x=experiment, ymin = mean_kd-sd2, ymax = mean_kd+sd2,
                      fill = RNA), position="dodge") +
    ylim(0, NA)  +
    geom_jitter(data = df, 
                aes(group=RNA,x = experiment, y = kd,fill = RNA), 
                position=position_jitterdodge(jitter.width = 0.1),pch = 21,size = 3,) +
    ggpubr::theme_pubr() +
    ylab("Kd / nM") +
    xlab("") +
  scale_fill_manual(values = c("#0c9bbdff","#ffb700ff")) 

exon_pval = t.test(df$kd[which(df$RNA=="H" & df$experiment=="exon")], df$kd[which(df$RNA=="R" & df$experiment=="exon")]) %>% broom::tidy() %>% pull(p.value)
intron_pval = t.test(df$kd[which(df$RNA=="H" & df$experiment=="intron")], df$kd[which(df$RNA=="R" & df$experiment=="intron")])  %>% broom::tidy() %>% pull(p.value) # this is an overestimate of significance - however still borderline significant (~0.05) when fitting error is taken into account

P-value two sample t-test:

  • RE - 0.0229142
  • RI - 0.0142136

Expression of ALS/FTD risk genes

als_ftd_genomics = c("ALS2", "ANG", "APP", "AR", "ATN1", "ATXN1", "ATXN10", "ATXN2", 
"ATXN3", "C21orf2", "C9orf72", "CHMP2B", "CSF1R", "DCTN1", "DNAJC5", 
"DNMT1", "EPM2A", "FIG4", "FUS", "GRN", "HNRNPA1", "HTT", "ITM2B", 
"JPH3", "KIF5A", "LOC101927815", "MAPT", "NHLRC1", "NOP56", "NOTCH3", 
"OPTN", "PFN1", "PMP22", "PRNP", "PSEN1", "PSEN2", "SCFD1", "SETX", 
"SIGMAR1", "SLC52A2", "SLC52A3", "SOD1", "TARDBP", "TBK1", "TBP", 
"TNIP1", "TYROBP", "UBQLN2", "UNC13A", "VAPB", "VCP")

ipsc_de = as.data.table(ipsc_de)
ipsc_splicing = as.data.table(ipsc_splicing)

ipsc_de[gene_name %in% als_ftd_genomics][order(log2FoldChange)][,.(log2FoldChange,padj,gene_name,gene_expresion)] %>% mutate(has_cryptic = gene_name %in% ipsc_splicing[cryptic_junction == T,gene_name]) %>% unique() %>% arrange(log2FoldChange) %>% knitr::kable()
log2FoldChange padj gene_name gene_expresion has_cryptic
-1.4755792 0.0000000 TARDBP downregulated FALSE
-0.4651237 0.3129656 AR not_significant FALSE
-0.4522688 0.0000000 PRNP downregulated FALSE
-0.3370146 0.0000000 MAPT downregulated FALSE
-0.2560809 0.0000197 ATXN10 downregulated FALSE
-0.2507640 0.0000114 UNC13A downregulated TRUE
-0.2459440 0.0000023 SIGMAR1 downregulated FALSE
-0.2233069 0.0000475 UBQLN2 downregulated FALSE
-0.2211469 0.0000258 ATN1 downregulated FALSE
-0.1942493 0.0003034 OPTN downregulated FALSE
-0.1628779 0.0024270 KIF5A downregulated FALSE
-0.1617614 0.0012821 DNAJC5 downregulated FALSE
-0.1384970 0.2533432 CHMP2B not_significant FALSE
-0.1220204 0.0515311 DNMT1 downregulated FALSE
-0.1049954 0.0844946 VCP downregulated FALSE
-0.0916095 0.0681453 APP downregulated FALSE
-0.0835958 0.3273028 PSEN2 not_significant FALSE
-0.0790801 0.4084679 FIG4 not_significant FALSE
-0.0663967 0.2592503 FUS not_significant FALSE
-0.0646336 0.3051997 DCTN1 not_significant FALSE
-0.0635069 0.5807439 SCFD1 not_significant FALSE
-0.0623503 0.5686538 SLC52A2 not_significant FALSE
-0.0476461 0.5766988 ATXN2 not_significant FALSE
-0.0423053 0.7469608 C9orf72 not_significant FALSE
-0.0294862 0.6926638 SOD1 not_significant FALSE
-0.0185168 0.8789865 SETX not_significant FALSE
-0.0183617 0.8304551 VAPB not_significant FALSE
0.0045770 0.9596904 JPH3 not_significant FALSE
0.0111296 0.9086136 ITM2B not_significant FALSE
0.0112341 0.9313250 TBP not_significant FALSE
0.0153951 0.8461312 PFN1 not_significant FALSE
0.0234133 0.8051831 HTT not_significant FALSE
0.0323912 0.7842848 C21orf2 not_significant FALSE
0.0536923 0.7306305 EPM2A not_significant FALSE
0.0578489 0.5192551 ATXN1 not_significant FALSE
0.0622795 0.4214349 NOP56 not_significant FALSE
0.0629227 0.4204136 ALS2 not_significant FALSE
0.0996251 0.8525963 NHLRC1 not_significant FALSE
0.1187887 0.0069879 HNRNPA1 upregulated FALSE
0.1295121 0.2157945 TBK1 not_significant FALSE
0.1591301 0.4195443 NOTCH3 not_significant FALSE
0.1801005 0.0032988 PSEN1 upregulated FALSE
0.2001363 0.0291925 ATXN3 upregulated FALSE
0.2340033 0.0005677 GRN upregulated FALSE
0.2560610 0.0152038 TNIP1 upregulated FALSE

Linear model on UNC13A CE PSI with covariates

# library size 
#llumina HiSeq 2500 (125 bp paired end) or an Illumina NovaSeq (100 bp paired end). 
full_cov = clean_data_table %>%
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    mutate(inclusion_reads = UNC13A_3prime_leaf + UNC13A_5prime_1_leaf) %>% 
    mutate(unc13a_detected = ifelse(inclusion_reads > 0, yes = "+", no = "-")) %>% 
    filter(sequencing_platform != "") %>% 
    left_join(cell_types %>% 
                  dplyr::select(cell,deconv,sample) %>% 
                  unique() %>% 
                  pivot_wider(id_cols = "sample",names_from = "cell",values_from = "deconv"), by = 'sample') %>%
    filter(!is.na(disease_group2)) %>% 
    unique() %>% 
    filter(disease_group2 %in% c("ALS-TDP","FTLD-TDP")) %>% 
    dplyr::select(c("unc13a_cryptic_leaf_psi",
                    "sample",  "tissue_clean",  
                    "rin",  "sex", 
                    "age",  "mutations", 
                    "sequencing_platform", 
                    "number_g_alleles", 
                    "stmn_2_cryptic_psi_leaf", "oligodendrocytes", 
                    "astrocytes", "neurons", "endothelial", "microglia")) %>% 
    column_to_rownames('sample')
    


fit <- lm(unc13a_cryptic_leaf_psi ~ ., data = full_cov)

fit %>% broom::tidy() %>% fwrite(paste0(here::here(),"/covariate_unc13a_ce.csv"))
full_cov %>% 
    group_by(tissue_clean) %>% 
    nest() %>% 
    mutate(lm_obj = map(data, ~lm(unc13a_cryptic_leaf_psi ~., data = .x))) %>% 
    mutate(broomed = map(lm_obj, broom::tidy)) %>% 
    dplyr::select(tissue_clean,broomed) %>% 
    unnest() %>% 
    filter(p.value < 0.05) %>% knitr::kable()
tissue_clean term estimate std.error statistic p.value
Frontal_Cortex stmn_2_cryptic_psi_leaf 0.5006214 0.0361175 13.860894 0.0000000
Cervical_Spinal_Cord (Intercept) -0.3120120 0.1505643 -2.072284 0.0404359
Cervical_Spinal_Cord sequencing_platformNovaSeq -0.0232872 0.0087809 -2.652043 0.0091104
Cervical_Spinal_Cord stmn_2_cryptic_psi_leaf 0.1016095 0.0510255 1.991348 0.0487726
Motor_Cortex sequencing_platformNovaSeq -0.0055742 0.0018216 -3.060125 0.0025378
Motor_Cortex number_g_alleles 0.0025967 0.0010752 2.415114 0.0166931
Motor_Cortex stmn_2_cryptic_psi_leaf 0.5734449 0.1021909 5.611505 0.0000001
Temporal_Cortex sexMale -0.0173710 0.0069398 -2.503112 0.0166080
Temporal_Cortex age -0.0011040 0.0004588 -2.406180 0.0209579
Temporal_Cortex number_g_alleles 0.0108918 0.0039751 2.740003 0.0092187
unc13a_data_fisher = unc13a_sequencing_platform %>% 
    filter(tissue_clean != "Thoracic_Spinal_Cord") %>% 
    group_by(disease_group2,tissue_clean) %>% 
    nest() %>% 
    mutate(fisher_pvalue = map_dbl(data, quick_fisher)) 



unc13a_data_fisher$data = NULL

unc13a_sequencing_platform_pct = unc13a_sequencing_platform %>%
    group_by(disease_group2,tissue_clean, sequencing_platform, unc13a_detected) %>% tally() %>%
    spread(key = unc13a_detected, value = n,fill = 0) %>%
    mutate(detection = `+` / ( `-` + `+`) ) %>% 
    mutate(total =  ( `-` + `+`) ) %>% 
    mutate(plot_name = glue::glue("{sequencing_platform} \n  ({total})")) %>% 
    left_join(unc13a_data_fisher)
Joining, by = c("disease_group2", "tissue_clean")
unc13a_sequencing_platform_pct %>%
    ggplot(aes(x = plot_name, y = detection )) + 
    geom_col(fill = "firebrick") +
    labs(title = "UNC13A CE detection and sequencing platform", 
         y = "UNC13A CE detected",x = element_blank(), subtitle = "Fisher exact test") + 
    facet_wrap(~disease_group2 + tissue_clean,scales = 'free_x') +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
    geom_text(aes(x = 1.5, y = 1.05, label = paste0("p = ", signif(fisher_pvalue,digits = 2) ) )) +
    theme(legend.text = element_text(size = 15)) +
    ggpubr::theme_pubr()

# scale_y_continuous(expand = c(0,0) ) 

Correlation between STMN2 CE PSI and RAP1GAP1/PFKP

patient_others = fread(file.path(here::here(),"data","nygc_pfkp_unc13b_rap1gap.bed"))


patient_others[,sample := gsub(".SJ.out","",V4)]
clean_data_table = clean_data_table %>% 
    left_join(patient_others %>% 
                  pivot_wider(id_cols = "sample",names_from = "V7",values_from = "V5",values_fill = 0))
Joining, by = c("sample", "RAP1GAP_annotated", "PFKP_annotated", "PFKP_noveldonor", "PFKP_novelacceptor", "RAP1GAP_noveldonor", "RAP1GAP_novelacceptor")
clean_data_table[, rap1gap_cryptic_psi := (RAP1GAP_noveldonor + RAP1GAP_novelacceptor)/ (RAP1GAP_noveldonor + RAP1GAP_novelacceptor + RAP1GAP_annotated)]
clean_data_table[, pfkp_cryptic_psi := (PFKP_novelacceptor + PFKP_noveldonor)/ (PFKP_novelacceptor + PFKP_noveldonor + PFKP_annotated)]



pfkp_plot = clean_data_table %>%
    filter(grepl("Cortex",tissue_clean)) %>%
    filter(disease_tissue == T) %>%
    filter((PFKP_novelacceptor + PFKP_noveldonor + PFKP_annotated) >=30) %>% 
    filter(junction_reads_stmn2 >= 30) %>%
    filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
    filter(pfkp_cryptic_psi > 0) %>%
    ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = pfkp_cryptic_psi)) +
    geom_point() +
    stat_cor(size = 8,method = "spearman",cor.coef.name = "rho") +
    geom_smooth(method = lm, se = FALSE,color = 'black') +
    xlab("STMN2 Cryptic PSI") +
    ylab("PFKP Cryptic PSI") +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 18)) +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1L))

rap1gap_plot = clean_data_table %>%
    filter(disease_tissue == T) %>%
    filter(grepl("Cortex",tissue_clean)) %>%
    filter((RAP1GAP_noveldonor + RAP1GAP_novelacceptor + RAP1GAP_annotated) >=30) %>% 
    filter(junction_reads_stmn2 >= 30) %>%
    filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
    filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
    filter(rap1gap_cryptic_psi > 0) %>%
    # ggplot(aes(x = rap1gap_cryptic_psi, y = unc13a_cryptic_leaf_psi)) +
    ggplot(aes(x = stmn_2_cryptic_psi_leaf, y = rap1gap_cryptic_psi)) +
    geom_point() +
    stat_cor(size = 8,method = "spearman",cor.coef.name = "rho") +
    geom_smooth(method = lm, se = FALSE,color = 'black') +
    xlab("STMN2 Cryptic PSI") +
    ylab("RAP1GAP Cryptic PSI") +
    ggpubr::theme_pubr() +
    theme(text = element_text(size = 18)) +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1L))

cowplot::plot_grid(rap1gap_plot,pfkp_plot)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

detected_table_for_graph_rap1 = clean_data_table %>%
  filter(!(rs12973192 == "C/G" & rs12608932 == "C/C")) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(rap1gap_cryptic_psi > 0) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
  filter((RAP1GAP_noveldonor + RAP1GAP_novelacceptor + RAP1GAP_annotated) >=30) %>% 
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,rs12608932,stmn_2_cryptic_psi_leaf,rap1gap_cryptic_psi) %>%
  mutate(log2_rap1gap = log2((rap1gap_cryptic_psi)/(stmn_2_cryptic_psi_leaf))) %>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
 as.data.table()

#get out that person who is discongruous

comparisons = combn(unique(detected_table_for_graph_rap1$detection_name),2,simplify = F)

rap1_fold_plot = detected_table_for_graph_rap1 %>%
  ggplot(aes(y = log2_rap1gap, x = detection_name,fill = detection_name)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  ylab(expression(paste("Lo", g[2], " fold CE PSI / STMN2 CE PSI "))) +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5) + 
  ggtitle("RAP1GAP")
detected_table_for_graph_pfkp = clean_data_table %>%
  filter(!(rs12973192 == "C/G" & rs12608932 == "C/C")) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  filter(pfkp_cryptic_psi > 0) %>%
  filter(stmn_2_cryptic_psi_leaf > 0 ) %>%
  filter(junction_reads_stmn2 >= 30) %>%
    filter((PFKP_novelacceptor + PFKP_noveldonor + PFKP_annotated) >=30) %>% 
  filter(grepl("Cortex",tissue_clean)) %>%
  filter(disease_tissue == T) %>%
  filter(disease_group2 %in% c("FTLD-TDP","ALS-TDP")) %>%
  dplyr::select(sample,tissue_clean,disease_group2,rs12973192,rs12608932,stmn_2_cryptic_psi_leaf,pfkp_cryptic_psi) %>%
  mutate(log2_pfkp = log2((pfkp_cryptic_psi)/(stmn_2_cryptic_psi_leaf))) %>%
  mutate(tissue_clean = gsub("_","\n",tissue_clean)) %>%
  group_by(rs12973192) %>%
  mutate(n_sample = length(unique(sample))) %>%
  ungroup() %>%
  mutate(detection_name = glue::glue("{rs12973192} \n ( {n_sample} )")) %>%
  mutate(no_cong = ifelse(rs12608932 == "C/C" & rs12973192 == "C/G",8,3)) %>% as.data.table()


comparisons = combn(unique(detected_table_for_graph_pfkp$detection_name),2,simplify = F)

pfkp_fold_plot = detected_table_for_graph_pfkp %>%
  ggplot(aes(y = log2_pfkp, x = detection_name,fill = detection_name)) +
  geom_boxplot(show.legend = F) +
  geom_jitter(show.legend = F, alpha = 0.8, height = 0,pch = 21) +
  stat_compare_means(aes(group = detection_name),comparisons = comparisons,label = "p.signif") +
  xlab("UNC13A rs12973192 Genotype") +
  scale_fill_manual(values = c("#88CCEE","#44AA99","#1B7739")) +
  ggpubr::theme_pubr() +
  theme(text = element_text(size = 24)) +
  geom_hline(yintercept = 0,size = 1.5) + 
  ggtitle("PFKP") + 
  ylab(element_blank())

cowplot::plot_grid(rap1_fold_plot,pfkp_fold_plot)

UNC13B isoform expression in-vitro and across the NYGC

iso_colors = c("#1972A9", "#28A919", "#F49F0B")
names(iso_colors) = c("UNC13B-207","UNC13B-208","UNC13B-210")
#207 -- 1972A9 - normal
#208 -- 28A919 - NMD
#210 -- F49F0B - brain
t_type = fread(file.path(here::here(),"data","transcripts-Summary-Homo_sapiens_Gene_Summary_ENSG00000198722.csv"))

cell_lines = fread(file.path(here::here(),"data","unc13b_salmon_cell_lines.csv"))
cell_lines %>% 
    mutate(sample = gsub(".Aligned.sorted.out","",sample)) %>% 
    left_join(rel_rna_cryptic_amount[,.(condition,plot_name,sample)]) %>% 
    left_join(t_type, by = c("Name" = "Transcript ID")) %>% 
    filter(Name.y %in% c("UNC13B-207",
                         "UNC13B-208",
                         "UNC13B-210")) %>%
    ggplot(aes(x =condition, y = TPM,fill = Name.y),color = 'black') + 
    geom_boxplot(size = 1.2,color = "black") +
   geom_point(stroke = 1.2,size = 2.3,pch = 21, position = position_jitterdodge(jitter.width = 0.1,jitter.height = 0)) +
    # facet_wrap(~plot_name) +
  ggpubr::theme_pubr() + 
  labs(fill="",x="") + 
  scale_fill_manual(values = iso_colors) 
Joining, by = "sample"

unc13b_expression = fread(file.path(here::here(),"data","unc13b_transcript_counts.csv"))
unc13b_expression %>% 
    left_join(t_type, by = c("Name" = "Transcript ID")) %>% 
    filter(sample != "") %>% 
    dplyr::select(sample,
                  EffectiveLength,
                  NumReads,
                  Name.y) %>% 
    unique() %>% 
    left_join(clean_data_table[,.(sample,library_size)]) %>% 
    filter(!is.na(library_size)) %>% 
    mutate(TPM = (NumReads )/ (library_size / 10^6)) %>% 
    unique() %>% 
  left_join(clean_data_table %>% 
    select(sample,disease_group2,tissue_clean)) %>% 
    unique() %>% 
    filter(tissue_clean %in% c("Frontal_Cortex", 
                               "Lumbar_Spinal_Cord", 
                               "Cervical_Spinal_Cord", 
                               "Motor_Cortex", 
                               "Temporal_Cortex")) %>% 
    mutate(disease_group2 = fct_relevel(disease_group2, "Control")) %>% 
      filter(Name.y %in% c("UNC13B-207",
                         "UNC13B-208",
                         "UNC13B-210")) %>%
    ggplot(aes(x = disease_group2, y = TPM, fill = Name.y)) + 
    geom_boxplot() +
    geom_point(pch = 21, position = position_jitterdodge(jitter.width = 0.1,jitter.height = 0)) +
    facet_wrap(~tissue_clean,scales = 'free') + 
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) + 
    ggpubr::theme_pubr() + 
    labs(fill="",x="") +
    scale_fill_manual(values = iso_colors)
Joining, by = "sample"
Joining, by = "sample"

Nanopore

Read in pre-processed data (see separate script) and plot

df <- read_csv(paste0(here::here(),
                      "/data/nanopore/combined_nanopore_dataframe.csv")) 

df[is.na(df)] <- 0

p2_patient <- ggplot(df %>% filter(classification != "Neither",
                                   type=="Patient"), aes(x = name, y = 100*fraction, fill = class2)) +
    geom_bar(stat="identity", position="dodge") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    ggtitle("Patient RNA") +
    xlab("") +
    ggeasy::easy_add_legend_title("")

p2_shsy <- ggplot(df %>% filter(classification != "Neither",
                                type=="SHSY5Y"), aes(x = name, y = 100*fraction, fill = class2)) +
    geom_bar(stat="identity", position="dodge") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    ggtitle("SH-SY5Y") +
    xlab("") +
    ggeasy::easy_add_legend_title("") 

p2_shsy | p2_patient

# without neither removed

p2_patient <- ggplot(df %>% filter(classification != "",
                                   type=="Patient"), aes(x = name, y = 100*fraction, fill = class2,
                                                         colour = class2)) +
    geom_bar(stat="identity", position="dodge") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    ggtitle("Patient RNA") +
    xlab("") +
    ggeasy::easy_add_legend_title("")

ggplot(df %>% filter(classification != "",
                                type=="SHSY5Y"), aes(x = name, y = fraction, fill = class2,
                                                     colour = class2)) +
    geom_bar(stat="identity", position="dodge") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    ggtitle("SH-SY5Y") +
    xlab("") +
    ggeasy::easy_add_legend_title("") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1L))  

p2_shsy | p2_patient

# fractional

df2 <- df %>%
    filter(classification != "Neither") %>%
    group_by(name) %>%
    mutate(f = n/sum(n))

ggplot(df2, aes(x = name, y = 100*f, fill = classification)) +
    geom_bar(stat="identity") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    xlab("") +
    ggeasy::easy_add_legend_title("")

ggplot(df2 %>%
           filter(type == "Patient"), aes(x = name, y = 100*f, fill = classification)) +
    geom_bar(stat="identity") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    xlab("") +
    ggeasy::easy_add_legend_title("") 

    ggplot(df2 %>%
           filter(type != "Patient"), aes(x = name, y = 100*f, fill = classification)) +
    geom_bar(stat="identity") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    xlab("") +
    ggeasy::easy_add_legend_title("")

# fractional
    
    
df %>%
    filter(classification != "Neither") %>%
    group_by(name) %>%
    mutate(f = n/sum(n)) %>% 
  filter(grepl("FTD",name)) %>% 
  ggplot(aes(x = name, y = 100*f, fill = classification)) +
    geom_bar(stat="identity") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    xlab("") +
    ggeasy::easy_add_legend_title("") +
  scale_fill_manual(values = c("#006613ff",
                               "#ffbf00ff",
                               "#f5671cff")) +
    theme(axis.ticks.length=unit(0.1,"inch"),
        axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))

ggplot(df %>% filter(type=="Patient"), 
       aes(x = name, y = 100*fraction, fill = class2)) +
    geom_bar(stat="identity", position="dodge",color = "black") +
    ggpubr::theme_pubr() +
    ggeasy::easy_rotate_x_labels(side = "right") +
    ylab("Percentage of reads") +
    ggtitle("Patient RNA") +
    xlab("") +
    ggeasy::easy_add_legend_title("") +
    scale_fill_manual(values = c("#006613ff",
                               "#ffbf00ff",
                               "#f5671cff",
                               "#290340ff")) +
    theme(axis.ticks.length=unit(0.1,"inch"),
        axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))

Different guides strength

guide_strength = fread(file.path(here::here(),"data","different_guides_cryptic_expression.csv"))

guide_strength %>% 
  melt() %>% 
  filter(`cell line` == "i5W") %>% 
   mutate(condition = fct_relevel(condition,
                              "control","tdp43_mild")) %>%
    ggbarplot(,
              x = "condition",
              add = c("mean_se","jitter"),
              y = "value",
              fill = 'variable',
              color = 'variable',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
      scale_color_manual(
        values = c("#1C2617","#262114","#262114")
    ) +
        scale_fill_manual(
        values = c("#1576b8","#eb7f3f","#975452")
    ) +
    scale_y_continuous(labels = scales::percent) +
  ggtitle("NCRM5") +
         theme(axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))

guide_strength %>% 
  melt() %>% 
  filter(`cell line` == "i11w_repeat") %>% 
   mutate(condition = fct_relevel(condition,
                              "control","tdp43_mild")) %>%
    ggbarplot(,
              x = "condition",
              add = c("mean_se","jitter"),
              y = "value",
              fill = 'variable',
              color = 'variable',
              position = position_dodge(0.8)) +
    ggpubr::theme_pubr() +
      scale_color_manual(
        values = c("#1C2617","#262114","#262114")
    ) +
        scale_fill_manual(
        values = c("#1576b8","#eb7f3f","#975452")
    ) +
    scale_y_continuous(labels = scales::percent) +
  ggtitle("WTC11") +
           theme(axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))

Dox curves and protein/RNA levels

library(tidyverse)
library(viridis)
Loading required package: viridisLite

Attaching package: 'viridis'
The following object is masked from 'package:scales':

    viridis_pal
library(dplyr)



Protein <- read.csv(file.path(here::here(),"data","Protein_RNA_Corr_3.csv"), colClasses = c("factor","factor","numeric","factor","factor"))

Protein$Treatment <- factor(Protein$Treatment, levels=c("NT","Dox-1","Dox-2","Dox-3","Dox-4","Dox-5"))
Protein$Line_Group <- factor(Protein$Line_Group, levels=c("Protein","RNA","Correct_RNA"))


agg=aggregate(Value~Treatment*Gene*Line_Group*Color_Group, data=Protein, FUN="mean")
std <- function(x) sd(x) / sqrt(length(x))

tmp = Protein %>% 
  group_by(Treatment,Gene,Line_Group,Color_Group) %>% 
  summarise(se = std(Value))
`summarise()` has grouped output by 'Treatment', 'Gene', 'Line_Group'. You can override using the `.groups` argument.
limits <- aes(ymax=Value+se, ymin=Value-se)
dodge <- position_dodge(width=0.9) 


agg = agg %>% 
  filter(Gene != "13A-RNA") %>% 
  mutate(Line_Group = gsub("Correct_RNA","RNA",Line_Group)) %>% 
  left_join(tmp)
Joining, by = c("Treatment", "Gene", "Line_Group", "Color_Group")
ggplot(agg, aes(y=Value, x=Treatment)) +
  geom_line(size = 2,mapping = aes(x=Treatment, y=Value,
                          linetype=Line_Group, group=Gene,color = Color_Group), size=1)+
  geom_errorbar(limits, 
                position=dodge, 
                width=0.24,
                size = 1.1)+
  xlab("") +
  ylab("") +
  ggpubr::theme_pubr()+
  scale_y_continuous(labels = scales::percent_format(scale = 1))+
  scale_colour_viridis_d(begin=0, end=1) +
  labs(linetype = "") +
  theme(axis.ticks.length=unit(0.1,"inch"),
        axis.line = element_line(colour = 'black', size = 1.5),
        axis.ticks = element_line(colour = "black", size = 2))

Endogenous SHSY5Y iCLIP

# Read in filtered bed file (iMAPs output, filtered by script in pre-processing scripts folder)
sh <- read_csv(paste0(here(), "/data/iCLIP/shsy5y_tdp43_iclip_unc13a_b_only.csv"))

── Column specification ────────────────────────────────────────────────────────
cols(
  chr = col_character(),
  start = col_double(),
  end.x = col_double(),
  score1 = col_double(),
  end.y = col_double(),
  score2 = col_double(),
  total = col_double()
)
unc13a <- sh %>%
  filter(chr == "chr19" & start > 17599327 & ((end.x < 17690344 & end.y == 0) | 
                                                (end.y < 17690344 & end.x == 0))) %>%
  select(start,total) %>%
  full_join(data.frame(start=17599327:17690344)) %>%
  arrange(start) 
Joining, by = "start"
unc13a[is.na(unc13a)] <- 0

window <- 20
p_a <- ggplot(unc13a, aes(x = start, y = zoo::rollmean(total, k = window, na.pad=T))) +
  geom_line(alpha = 1) +
  ylim(0, NA) +
    # blue dashed line shows position of CE
  geom_vline(xintercept = 17642430, linetype="dashed", alpha= 0.3, size=2, colour="blue") +
    # red line shows position of retained intron
  geom_vline(xintercept = 17628569, linetype="dashed", alpha= 0.3, size=2, colour="red") +
  ggtitle("UNC13a") +
  ylab(paste("Average XL density in", window, "nt window")) +
  theme_minimal()


unc13b <- sh %>%
  filter(chr == "chr9" & start > 35160008 & ((end.x < 35407338 & end.y == 0) | 
                                                (end.y < 35407338 & end.x == 0))) %>%
  select(start,total) %>%
  full_join(data.frame(start=35160008:35407338)) %>%
  arrange(start)
Joining, by = "start"
unc13b[is.na(unc13b)] <- 0

p_b <- ggplot(unc13b, aes(x = start, y = zoo::rollmean(total, k = window, na.pad=T))) +
  geom_line(alpha = 1) +
  ylim(0, NA) +
    # blue dashed line shows rough position of UG-repeat near fsE
  geom_vline(xintercept = 35364641, linetype="dashed", alpha= 0.3, size=2, colour="blue") +
  theme_minimal() +
  ggtitle("UNC13B") +
  ylab(paste("Average XL density in", window, "nt window"))


p_a / p_b