TAPhelpR is intended to help prepare to run TAP and then evaluate and use the outputs.
Prior to running TAP
After running TAP
if(!require("devtools")){
install.packages("devtools")
}
# This optional dependency has fallen out of Bioconductor and must be installed manually.
# It is only required to download data from ENCODE.
devtools::install_github("CharlesJB/ENCODExplorer")
devtools::install_github("FrietzeLabUVM/TAPhelpR")
library(TAPhelpR)
library(tidyverse)
Configuration files are strongly recommended to run TAP. They can control all command line parameters and also set paths to fastq files. You should also take the opportunity to rename samples to something more meaningful and easier to work with. If you have sequencing replicates, they can be aggregated to single samples in the config.
To use TAPhelpR to generate your config file, you need 2 things. A directory with reference files and a directory with fastq.gz files.
The reference directory should be generated using
setup_scripts/setup_new_reference.sh from the directory
where TAP is installed. This script requires 2 minimal inputs with some
additional options. You must provide a single .fasta file and compatible
(same organism assembly) gene reference .gtf/.gff file.
The fastq.gz files must be gzipped should be for both reads if you have paired-end data. They also must have consistent suffixes (i.e. all end in _R1_001.fastq.gz or _R2_001.fastq.gz for PE data or all end in _R1_001.fastq.gz for SE data). It’s OK if your suffixes differ from these defaults, just be sure you specify f1_suffix (and f2_suffix for PE data) when creating the config file.
# if you're running this code locally, tap_in should be where the honeybee .fastq.gz files
# tap_out should any location you wish to write the pipeline outputs to
tap_in = example_honeybee_input()
tap_out = example_honeybee_output()
cfg = config_create(
inDir = tap_in,
f1_suffix = "_1.fastq.gz",
f2_suffix = "_2.fastq.gz",
outDir = tap_out,
singularity = "~/lab_shared/scripts/TAP/tap_latest.sif",
reference = example_honeybee_reference()
)
config_create only gathers the information required to
write the config file. We have the opportunity to add some important
information before writing this config.
In our case, TAPhelpR provides a metadata file to map
SRR accessions to meaningful sample names.
# data.frame from config_create contains fastq file names and default sample name (fastq with suffix removed).
cfg_fq = cfg$fastq_lines
colnames(cfg_fq) = c("file", "srr")
# use merge to add metadata
meta_df = example_honeybee_metadata()
cfg_fq = merge(meta_df, cfg_fq, by = "srr")
cfg_fq = cfg_fq[, c("file", "name")]
# replace fastq data.frame in cfg
cfg$fastq_lines = cfg_fq
Now we want to write the config file.
config_write(cfg, "honeybee_config.csv", overwrite = TRUE)
## [1] "honeybee_config.csv"
Prior to running TAP, it’s a good idea to validate our config file. This validation will not catch all possible errors but focuses on the most common and hardest to diagnose.
## V1 V2 V3 V4
## SRR3123279_1.fastq.gz honeybee 4days worker rep1
## SRR3123281_1.fastq.gz honeybee 4days worker rep2
## SRR3123372_1.fastq.gz honeybee 4days queen rep1
## SRR3123380_1.fastq.gz honeybee 4days queen rep2
Note the output of the above is a data.frame with fastq files as the rownames and the _/. delimited name elements split. This also allows you to verify that your names are well formed. Well formed names have the same number of elements. Poorly formed names will make downstream analysis more painful as you’ll constantly have to add additional code to handle these inconsistencies.
You’ll see a warning if sample names are not well formed.
You will need the ENCODExplorer package installed.
Fetching data consists of 2 steps. Retrieving file information from ENCODE as a data.frame. This data.frame must then be filtered and used to derive file names. Once this is done, the data.frame is used to fetch or download the specified files.
enc_df = ENCODE_get_file_info(request_organisms = "Homo sapiens", request_target_chipseq = "IKZF1")
## Results : 258 files, 6 datasets
enc_df.fe_bw = subset(enc_df, file_format == "bigWig" & output_type == "fold change over control" & assembly == "GRCh38")
enc_df.fe_bw = dplyr::mutate(
enc_df.fe_bw,
rep = ifelse(grepl(";", biological_replicates), "pooled", biological_replicates)
)
DT::datatable(enc_df.fe_bw)
# you must set file_name on the input data.frame
enc_df.fe_bw = dplyr::mutate(
enc_df.fe_bw,
file_name = paste(sep = "_",
biosample_name,
target,
rep,
file_accession,
"FE.bw"
)
)
ENCODE_download_files(enc_df.fe_bw)
Retrieving data from GEO follows the same pattern as getting files from ENCODE except that you start with a valid GEO series accession number, i.e. GSE####.
# you need the GSE accession
gse_df = GEO_get_file_info("GSE152028")
# you typically need to do some cleanup from the title
gse_df = dplyr::mutate(
gse_df,
name = sub(" \\+ ", "&", title)
)
gse_df = dplyr::mutate(
gse_df,
name = sub(" - ", " ", name)
)
gse_df = dplyr::mutate(
gse_df,
name = sub("input_DNA", "input", name)
)
gse_df$name = gsub(" +", "_", gse_df$name)
gse_df$name = paste0(gse_df$name, ".", gse_df$srr)
DT::datatable(gse_df)
GEO_download_files(gse_df$srr, fastq_prefixes = gse_df$name, singularity = "tap_latest.sif", bash_or_sbatch = "bash")
After submitting the TAP pipeline, it’s nice to know how close to
completion it is. Once it’s finished, identifying and debugging any
errors is also critical. TAPhelpR provides a series of
report functions for this purpose.
report_completion is the most basic of these functions.
It simply checks for the existence of 3 files that TAP write to record
its status.
report_completion(tap_out)
## 4 samples detected
## 4 (100%) are no longer in process.
## 4 (100%) have completed successfully.
## name start finish complete
## honeybee_4days_queen_rep1 honeybee_4days_queen_rep1 TRUE TRUE TRUE
## honeybee_4days_queen_rep2 honeybee_4days_queen_rep2 TRUE TRUE TRUE
## honeybee_4days_worker_rep1 honeybee_4days_worker_rep1 TRUE TRUE TRUE
## honeybee_4days_worker_rep2 honeybee_4days_worker_rep2 TRUE TRUE TRUE
Here we see a message reporting the number of samples that have begun running and: 1) how many are still processing and 2) how many have finished with no errors.
If all samples have finished without error there’s no reason to investigate further.
For a more detailed report of progress, use
report_progress. You’ll receive a warning if any steps
failed to complete due to errors.
report_progress(tap_out)
## step honeybee_4days_queen_rep1 honeybee_4days_queen_rep2
## 1: STAR_align complete complete
## 2: bsortindex complete complete
## 3: salmon_quant complete complete
## 4: suppa2 complete complete
## 5: exactSNP complete complete
## 6: make_bigwigs complete complete
## honeybee_4days_worker_rep1 honeybee_4days_worker_rep2
## 1: complete complete
## 2: complete complete
## 3: complete complete
## 4: complete complete
## 5: complete complete
## 6: complete complete
Status plots for a complete run look like this:
report_progress_plot(tap_out)
If there are errors or jobs still in progress you’ll see something like this:
report_progress_plot(example_honeybee_output.in_progress())
Finally, if there were errors, use report_errors.
report_errors(tap_out)
This will return paths to all of the relevant .error log files. You’ll need to investigate these files to diagnose and resolve the errors. You may need to delete affected files but then you can resubmit TAP using the same output. You must allow TAP to finish running or cancel remaining jobs before resubmitting.
TAPhelpR contains several setup_*_files functions to create a convenient data.frame containing file paths and meta data.
bam_files = setup_bam_files(tap_out, variable_map = c("species", "day", "role", "rep"))
The generic equivalent of the above looks like this:
setup_files(tap_out, pattern = "sortedByCoord.out.bam$", variable_map = c("species", "day", "role", "rep"))
## file
## 1: /slipstream_old/home/joeboyd/R_workspace.combined/TAPhelpR.data/honeybee_TAP_output.rename/honeybee_4days_queen_rep1.Aligned.sortedByCoord.out.bam
## 2: /slipstream_old/home/joeboyd/R_workspace.combined/TAPhelpR.data/honeybee_TAP_output.rename/honeybee_4days_queen_rep2.Aligned.sortedByCoord.out.bam
## 3: /slipstream_old/home/joeboyd/R_workspace.combined/TAPhelpR.data/honeybee_TAP_output.rename/honeybee_4days_worker_rep1.Aligned.sortedByCoord.out.bam
## 4: /slipstream_old/home/joeboyd/R_workspace.combined/TAPhelpR.data/honeybee_TAP_output.rename/honeybee_4days_worker_rep2.Aligned.sortedByCoord.out.bam
## species day role rep name
## 1: honeybee 4days queen rep1 honeybee_4days_queen_rep1
## 2: honeybee 4days queen rep2 honeybee_4days_queen_rep2
## 3: honeybee 4days worker rep1 honeybee_4days_worker_rep1
## 4: honeybee 4days worker rep2 honeybee_4days_worker_rep2
suppa_joinFiles(bam_files, by = "role")
suppa_diffSplice(
ref_location = example_honeybee_reference(),
PSI_todo = TAP_SPLICE_EVENTS$SkippingExon)
suppa_diffSplice.within_group(
input_files = bam_files,
within_group = "day",
between_group = "role",
ref_location = example_honeybee_reference(),
PSI_todo = TAP_SPLICE_EVENTS$SkippingExon)
suppa_clusterEvents(PSI_todo = TAP_SPLICE_EVENTS$SkippingExon)
mat = load_counts(tap_out)
head(mat)
## honeybee_4days_queen_rep1 honeybee_4days_queen_rep2
## LOC551580 281 396
## LOC551555 3346 3664
## LOC726347 43 81
## Rfwd3 296 320
## LOC726145 1763 1825
## LOC551448 1101 1256
## honeybee_4days_worker_rep1 honeybee_4days_worker_rep2
## LOC551580 218 213
## LOC551555 2599 2126
## LOC726347 56 59
## Rfwd3 215 185
## LOC726145 1673 1484
## LOC551448 939 674
#if you supply the reference gtf file, or GRanges of same, to load_counts, gene_ids in counts will be aggregated to gene_name
mat.gene_name = load_counts(tap_out, gtf_file = file.path(example_honeybee_reference(), "GTF/current.gtf"), name_attribute = "gene")
library(DESeq2)
meta_df = bam_files
meta_df$file = NULL
meta_df = dplyr::mutate(meta_df, name = paste(species, day, role, rep, sep = "_"))
des = DESeqDataSetFromMatrix(mat[, meta_df$name], colData = meta_df, design = ~role)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# and run from there ...
stage_output_for_UCSC_tracks(tap_out, track_hosting_dir = "~/public_files/honeybee", overwrite = TRUE)
"~/public_files/honeybee" %>%
dir(full.names = TRUE) %>%
dir(full.names = TRUE) %>%
sample(size = 6)
## [1] "/slipstream/home/joeboyd/public_files/honeybee/raw_positive/6d_queen_A.Aligned.sortedByCoord.out_raw_positive.bw"
## [2] "/slipstream/home/joeboyd/public_files/honeybee/normalized_unstranded/6d_worker_A.Aligned.sortedByCoord.out_normalized_unstranded.bw"
## [3] "/slipstream/home/joeboyd/public_files/honeybee/normalized_positive_showSplice/honeybee_4days_worker_rep1.Aligned.sortedByCoord.out_normalized_positive.showSplice.bw"
## [4] "/slipstream/home/joeboyd/public_files/honeybee/raw_unstranded_showSplice/honeybee_4days_worker_rep2.Aligned.sortedByCoord.out_raw_unstranded.showSplice.bw"
## [5] "/slipstream/home/joeboyd/public_files/honeybee/raw_unstranded_showSplice/6d_worker_B.Aligned.sortedByCoord.out_raw_unstranded.showSplice.bw"
## [6] "/slipstream/home/joeboyd/public_files/honeybee/raw_positive/6d_worker_B.Aligned.sortedByCoord.out_raw_positive.bw"
launch shiny app
launch_UCSC_tracks_app.UVM_galaxy(track_hosting_dir = "~/public_files/honeybee")