ChIPtsne2 workflow

Objective

To identify a set of sites where ERG is changing with IK induction.

We’re going to use chipstne2 to help us understand different ways of defining differential sites and change our strategy accordingly.

Setup

suppressPackageStartupMessages({
    library(IKdata)
    library(chiptsne2)
    library(seqsetvis)
    library(GenomicRanges)
    library(data.table)
})

options(mc.cores = 20)
treatment_colors = c("IK" = "purple", GFP = "green")

IKdata provides convenient shortcuts to setup bam and peak files with metadata.

bam_cfg_dt = cutnrun.setup_bam_files()
bam_cfg_dt = bam_cfg_dt[mark == "ERG"]
bam_cfg_dt = bam_cfg_dt[order(treatment)]
bam_cfg_dt$name = factor(bam_cfg_dt$name, levels = unique(bam_cfg_dt$name))
bam_cfg_dt$name = droplevels(bam_cfg_dt$name)

np_cfg_dt = cutnrun.setup_peak_files()
np_cfg_dt = np_cfg_dt[mark == "ERG"]

Peak Overlap Method

Peaks

Load peaks.

peak_grs = easyLoad_narrowPeak(np_cfg_dt$file, file_names = np_cfg_dt$name)
ssvFeatureBars(peak_grs)

ssvFeatureEuler(peak_grs)

Overlap peaks.

olap_grs = ssvConsensusIntervalSets(peak_grs, min_number = 2, min_fraction = 0)

## min_number of 2 used for consensus threshold.

olap_grs = prepare_fetch_GRanges_names(olap_grs)

Define “differential” peaks such that peaks are in both cell lines in one condition and neither cell line in the other condition.

gr_peaks_GFP_only = subset(olap_grs, (PDX2_ERG_GFP_rep1 & MXP5_ERG_GFP_rep1) &  !(PDX2_ERG_IK_rep1 | MXP5_ERG_IK_rep1))
gr_peaks_GFP_only$peak_group = "GFP"
gr_peaks_IK_only = subset(olap_grs, !(PDX2_ERG_GFP_rep1 | MXP5_ERG_GFP_rep1) &  (PDX2_ERG_IK_rep1 & MXP5_ERG_IK_rep1))
gr_peaks_IK_only$peak_group = "IK"

diff_gr = c(
    gr_peaks_GFP_only,
    gr_peaks_IK_only
)

#invert selects regions NOT overlapping
notDiff_gr = subsetByOverlaps(olap_grs, diff_gr, invert = TRUE)
set.seed(0)
notDiff_gr = sample(notDiff_gr, length(diff_gr)/2)
notDiff_gr$peak_group = "not_diff"

query_gr = c(diff_gr, notDiff_gr)

Signal

Creating a FetchConfig to define how to fetch signal allows us to easily create a ChIPtsne2 object.

cfg = FetchConfig(bam_cfg_dt, read_mode = "bam_PE")
ct2.raw = ChIPtsne2.from_FetchConfig(cfg, query_gr)
setPositionVariable(ct2.raw, "bp")
setRegionVariable(ct2.raw, "ERG peak")

So far, ct2.raw only has fetched raw signal, no normalization has been applied, let alone t-SNE.

A standard workflow looks like this:

ct2 = ct2.raw %>% 
    setSeed(0) %>%
    normalizeSignalRPM() %>% 
    calculateSignalCapValue() %>%
    normalizeSignalCapValue() %>% 
    centerProfilesAndRefetch() %>%
    normalizeSignalRPM() %>% 
    calculateSignalCapValue() %>%
    normalizeSignalCapValue() %>%
    flipProfilesToMatch() %>%
    groupRegionsBySignalCluster() %>%
    dimReduceTSNE() %>% 
    groupRegionsByDimReduceCluster() %>%
    setValueVariable("normalized reads")

plotSignalHeatmap(ct2, group_VAR = "peak_group", name_FUN = function(x)x, relative_heatmap_width = .8)

plotSignalLinePlot(ct2, 
                   group_VAR = "peak_group", 
                   facet_VAR = "cell", 
                   color_VAR = "treatment") +
    scale_color_manual(values = treatment_colors)

plotSignalLinePlot(ct2, 
                   group_VAR = "peak_group",
                   facet_VAR = "cell", 
                   color_VAR = "treatment",
                   n_splines = 5)+
    scale_color_manual(values = treatment_colors)

Treatment differences

plotDimReduceSummaryProfiles(ct2, x_bins = 10, color_VAR = "treatment") +
    scale_color_manual(values = treatment_colors) +
    facet_wrap(~cell) +
    theme(panel.background = element_rect(fill = "gray65"), 
          panel.grid = element_line(color = "gray60"))

Cell to cell differences

plotDimReduceSummaryProfiles(ct2, x_bins = 10, color_VAR = "cell") +
    facet_wrap(~treatment)

t-SNE arithmetic

cn = colnames(ct2)
names(cn) = cn
ct2.split = lapply(cn, function(x)ct2[, x])
lapply(ct2.split, dim)

## $PDX2_ERG_GFP_rep1
## [1] 7152    1
## 
## $MXP5_ERG_GFP_rep1
## [1] 7152    1
## 
## $PDX2_ERG_IK_rep1
## [1] 7152    1
## 
## $MXP5_ERG_IK_rep1
## [1] 7152    1

ct2.gfp = (ct2.split$MXP5_ERG_GFP_rep1 + ct2.split$PDX2_ERG_GFP_rep1)/2
ct2.ik = (ct2.split$MXP5_ERG_IK_rep1 + ct2.split$PDX2_ERG_IK_rep1)/2
ct2.diff = ct2.ik - ct2.gfp

p.gfp = plotDimReducePoints(ct2.gfp, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.ik = plotDimReducePoints(ct2.ik, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.diff = plotDimReducePoints(ct2.diff, 
                             has_symmetrical_limits = TRUE, point_color_limits = c(-.5, .5),
                             point_colors = c("green", "gray45", "gray35", "gray45", "purple"))+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.peaks = plotDimReducePoints(ct2, color_VAR = "peak_group") +
    guides(color = guide_legend(override.aes = list(size = 3))) +
    scale_color_manual(values = c(GFP = "green", IK = "purple", mixed = "gray"))

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

cowplot::plot_grid(
    p.gfp, p.ik, p.diff, p.peaks, nrow = 2
)

Peak group is not doing a good job capture areas where ERG differs between cell lines.

CSAW

csaw_file = "~/R_workspace.combined/preB_IK_induction/output_csaw.091923/ERG_cutnrun/IK_vs_GFP_in_all.csaw_regions.csv"
dt = fread(csaw_file)
dt = dt[PValue < .05]
dt = dt[direction != "mixed"]
# split(dt, dt$direction)
table(dt$direction)

## 
## down   up 
## 2477 1883

csaw_gr = GRanges(dt)
csaw_gr$up_treatment = ifelse(csaw_gr$direction == "up", "IK", "GFP")
csaw_gr$original_width = width(csaw_gr)

ct_gr = rowRanges(ct2)
olap = findOverlaps(query = ct_gr, subject = csaw_gr)
ct_gr$csaw_group = "ns"
ct_gr$csaw_group[queryHits(olap)] = csaw_gr$up_treatment[subjectHits(olap)]
ct_gr$csaw_width = NA
ct_gr$csaw_width[queryHits(olap)] = csaw_gr$original_width[subjectHits(olap)]
table(ct_gr$csaw_group)

## 
##  GFP   IK   ns 
##  713  221 6218

ct2.csaw1 = setRegionMetaData(ct2, as.data.frame(ct_gr))

## setRegionMetaData ...

plotSignalHeatmap(ct2.csaw1, group_VARS = c("peak_group", "csaw_group"), name_FUN = function(x)x, relative_heatmap_width = .8)

plotSignalLinePlot(ct2.csaw1, 
                   group_VAR = "csaw_group",
                   facet_VAR = "cell", 
                   color_VAR = "treatment",
                   n_splines = 5)+
    scale_color_manual(values = treatment_colors)

cn = colnames(ct2.csaw1)
names(cn) = cn
ct2.csaw1.split = lapply(cn, function(x)ct2.csaw1[, x])
lapply(ct2.csaw1.split, dim)

## $PDX2_ERG_GFP_rep1
## [1] 7152    1
## 
## $MXP5_ERG_GFP_rep1
## [1] 7152    1
## 
## $PDX2_ERG_IK_rep1
## [1] 7152    1
## 
## $MXP5_ERG_IK_rep1
## [1] 7152    1

ct2.csaw1.gfp = (ct2.csaw1.split$MXP5_ERG_GFP_rep1 + ct2.csaw1.split$PDX2_ERG_GFP_rep1)/2
ct2.csaw1.ik = (ct2.csaw1.split$MXP5_ERG_IK_rep1 + ct2.csaw1.split$PDX2_ERG_IK_rep1)/2
ct2.csaw1.diff = ct2.csaw1.ik - ct2.csaw1.gfp

colnames(ct2.csaw1.gfp) = "mean GFP"
colnames(ct2.csaw1.ik) = "mean IK induced"
colnames(ct2.csaw1.diff) = "mean IK induced - GFP"

p.gfp = plotDimReducePoints(ct2.csaw1.gfp, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.ik = plotDimReducePoints(ct2.csaw1.ik, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.diff = plotDimReducePoints(ct2.csaw1.diff, 
                             has_symmetrical_limits = TRUE, point_color_limits = c(-.5, .5),
                             point_colors = c("green", "gray45", "gray35", "gray45", "purple"))+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.peaks = plotDimReducePoints(ct2.csaw1, color_VAR = "csaw_group") +
    guides(color = guide_legend(override.aes = list(size = 3))) +
    scale_color_manual(values = c(GFP = "green", IK = "purple", mixed = "gray"))

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

cowplot::plot_grid(
    p.gfp, p.ik, p.diff, p.peaks, nrow = 2
)

Still not good!

plotDimReducePoints(subsetRow(ct2.csaw1, !is.na(csaw_width)), color_VAR = "csaw_width", point_size = 1) +
    scale_color_gradientn(colours = c("blue", "red"))

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Low width of csaw results (few diff windows in a row)

Csaw with min width

csaw_file = "~/R_workspace.combined/preB_IK_induction/output_csaw.091923/ERG_cutnrun/IK_vs_GFP_in_all.csaw_regions.csv"
dt = fread(csaw_file)
dt = dt[PValue < .05]
dt = dt[direction != "mixed"]
# split(dt, dt$direction)
table(dt$direction)

## 
## down   up 
## 2477 1883

csaw_gr = GRanges(dt)
csaw_gr = subset(csaw_gr, width >= 1e3)
csaw_gr$up_treatment = ifelse(csaw_gr$direction == "up", "IK", "GFP")
csaw_gr$original_width = width(csaw_gr)

if(FALSE){
    ct_gr = rowRanges(ct2)
    olap = findOverlaps(query = ct_gr, subject = csaw_gr)
    ct_gr$csaw_group = "ns"
    ct_gr$csaw_group[queryHits(olap)] = csaw_gr$up_treatment[subjectHits(olap)]
    ct_gr$csaw_width = NA
    ct_gr$csaw_width[queryHits(olap)] = csaw_gr$original_width[subjectHits(olap)]
    table(ct_gr$csaw_group)
    
    ct2.csaw2 = setRegionMetaData(ct2, as.data.frame(ct_gr))
}


#adding annotation is now easy with GRanges
ct2.csaw2 = ct2 %>%
    addRegionAnnotation(csaw_gr, anno_VAR = "up_treatment", anno_VAR_renames = "csaw_group", no_overlap_value = "ns") %>%
    addRegionAnnotation(csaw_gr, anno_VAR = "original_width", anno_VAR_renames = "csaw_width", no_overlap_value = NA)

## addRegionAnnotation ...
## addRegionAnnotation ...

ct2.csaw3 = ct2 %>%
    addRegionAnnotation(csaw_gr, 
                        anno_VAR = c("up_treatment", "original_width"), 
                        anno_VAR_renames = c("csaw_group", "csaw_width"), 
                        no_overlap_value = c("ns", NA))

## addRegionAnnotation ...

plotSignalHeatmap(ct2.csaw2, group_VARS = c("peak_group", "csaw_group"), name_FUN = function(x)x, relative_heatmap_width = .8)

plotSignalLinePlot(ct2.csaw2, 
                   group_VAR = "csaw_group",
                   facet_VAR = "cell", 
                   color_VAR = "treatment",
                   n_splines = 5)+
    scale_color_manual(values = treatment_colors)

cn = colnames(ct2.csaw2)
names(cn) = cn
ct2.csaw2.split = lapply(cn, function(x)ct2.csaw2[, x])
lapply(ct2.csaw2.split, dim)

## $PDX2_ERG_GFP_rep1
## [1] 7152    1
## 
## $MXP5_ERG_GFP_rep1
## [1] 7152    1
## 
## $PDX2_ERG_IK_rep1
## [1] 7152    1
## 
## $MXP5_ERG_IK_rep1
## [1] 7152    1

ct2.csaw2.gfp = (ct2.csaw2.split$MXP5_ERG_GFP_rep1 + ct2.csaw2.split$PDX2_ERG_GFP_rep1)/2
ct2.csaw2.ik = (ct2.csaw2.split$MXP5_ERG_IK_rep1 + ct2.csaw2.split$PDX2_ERG_IK_rep1)/2
ct2.csaw2.diff = ct2.csaw2.ik - ct2.csaw2.gfp

p.gfp = plotDimReducePoints(ct2.csaw2.gfp, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.ik = plotDimReducePoints(ct2.csaw2.ik, has_symmetrical_limits = FALSE)+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.diff = plotDimReducePoints(ct2.csaw2.diff, 
                             has_symmetrical_limits = TRUE, point_color_limits = c(-.5, .5),
                             point_colors = c("green", "gray35", "purple"))+
    theme(panel.background = element_rect(fill = "gray30"), panel.grid = element_line(color = "gray40"))
p.peaks = plotDimReducePoints(subsetRow(ct2.csaw2, expr = csaw_group != "ns"), color_VAR = "csaw_group") +
    guides(color = guide_legend(override.aes = list(size = 3))) +
    scale_color_manual(values = c(GFP = "green", IK = "purple", mixed = "gray"))

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

cowplot::plot_grid(
    p.gfp, p.ik, p.diff, p.peaks, nrow = 2
)